Files
notify-bridge/packages/server/src/notify_bridge_server/api/backup.py
T
alexei.dolgolyov 56993d2ca3 fix(security,perf): harden restore, CSRF, token_version + perf pass
Security
- Sign pending_restore.json (SHA256 stored in AppSetting, verified on
  startup apply) + refuse path outside data_dir, tighten to 0600.
- Require same-origin Origin/Referer on POST /api/backup/apply-restart —
  Bearer-in-localStorage is CSRF-reachable from any XSS'd admin tab.
- Bump token_version on role/username change and admin password reset so
  demoted admins lose admin in already-issued JWTs.  Guard last-admin
  TOCTOU via COUNT + post-commit re-check that rolls back a race.
- SSRF guard (validate_outbound_url) in ImmichClient.__init__ and the
  external_domain setter — admin-mutable URLs were bypassing the check
  that webhook/slack/discord paths already used.  Dev restart script now
  sets NOTIFY_BRIDGE_ALLOW_PRIVATE_URLS=1 so homelab Immich still works.
- Redact + cap Immich error bodies to ~120 chars before they flow into
  ActionExecution.error / EventLog.details (both UI-visible).
- Deny-list sensitive keys (api_key / token / secret / password /
  authorization / cookie / ...) in template-context merges so a rogue
  template can't exfiltrate provider creds via {{ api_key }}.
- Cap user-controlled Immich search params (query ≤256, person_ids ≤50,
  size ≤100) so a Telegram listener can't DoS upstream.
- Stream upload reads with running byte counter + content-length precheck
  instead of buffering the full body and then rejecting.
- Log Telegram parse_mode fallbacks instead of swallowing silently;
  template escape bugs now surface in server logs.
- Rollback partial imports on pending-restore failure (error recorded on
  a fresh session).

Performance
- Fix N+1 in _refresh_telegram_chat_titles: single IN query instead of
  session.get per chat.
- Parallelize album + shared-link fetches in test_dispatch (asyncio.gather)
  and per-receiver Telegram test sends in notifier (semaphore 5).
- Early-exit collect_scheduled_assets(limit=0) so the periodic-summary
  test path skips full per-album filter/sample (was O(album_assets)).
- Emit explicit CREATE INDEX IF NOT EXISTS for event_log user_id /
  action_id / provider_id so the first boot after upgrade isn't left
  unindexed for the dashboard query.
- Add AbortController timeout (120s) to fetchAuth so uploads/downloads
  don't hang indefinitely.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 02:28:55 +03:00

546 lines
19 KiB
Python

"""Configuration backup/restore API (admin only)."""
import asyncio
import hashlib
import json
import logging
import os
import signal
from datetime import datetime, timezone
from urllib.parse import urlparse
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, UploadFile, File, Query
from fastapi.responses import JSONResponse
from sqlmodel.ext.asyncio.session import AsyncSession
from ..auth.dependencies import require_admin
from ..config import settings as app_config
from ..database.engine import get_session
from ..database.models import AppSetting, User
from ..services.backup_schema import (
ALL_CATEGORIES, BackupCategory, BackupFile, ConflictMode, SecretsMode,
)
from ..services.backup_service import (
cleanup_old_backups, export_backup, export_backup_to_file, import_backup,
list_backup_files, validate_backup,
)
# Pending-restore marker keys (single source of truth consumed at startup)
PENDING_RESTORE_PATH_KEY = "pending_restore_path"
PENDING_RESTORE_CONFLICT_KEY = "pending_restore_conflict_mode"
PENDING_RESTORE_UPLOADED_AT_KEY = "pending_restore_uploaded_at"
PENDING_RESTORE_UPLOADED_BY_KEY = "pending_restore_uploaded_by"
# SHA256 of the staged pending_restore.json, written atomically with the file.
# The startup hook refuses to apply if the on-disk file's hash does not match —
# defends against anyone dropping a tampered file into data/ between prepare
# and restart.
PENDING_RESTORE_SHA256_KEY = "pending_restore_sha256"
def _pending_restore_path():
return app_config.data_dir / "pending_restore.json"
def _applied_restores_dir():
return app_config.data_dir / "applied_restores"
_LOGGER = logging.getLogger(__name__)
router = APIRouter(prefix="/api/backup", tags=["backup"])
MAX_UPLOAD_SIZE = 10 * 1024 * 1024 # 10 MB
async def _read_upload_bounded(file: UploadFile, max_bytes: int = MAX_UPLOAD_SIZE) -> bytes:
"""Read an UploadFile into memory, failing fast if it exceeds ``max_bytes``.
Rejects on ``content_length`` header up-front when available; always
stream-reads with a running byte counter so we never allocate more than
the limit even when the header is missing or lies.
"""
# Fast path: reject on header before we allocate anything.
cl = file.headers.get("content-length") if hasattr(file, "headers") else None
if cl:
try:
if int(cl) > max_bytes:
raise HTTPException(status_code=400, detail="File too large (max 10 MB)")
except ValueError:
pass
chunks: list[bytes] = []
total = 0
while True:
chunk = await file.read(64 * 1024)
if not chunk:
break
total += len(chunk)
if total > max_bytes:
raise HTTPException(status_code=400, detail="File too large (max 10 MB)")
chunks.append(chunk)
return b"".join(chunks)
def _check_same_origin(request: Request) -> None:
"""Reject cross-origin admin-write POSTs (CSRF defense).
Bearer tokens in ``localStorage`` plus cookie-less CORS mean a malicious
page cannot technically submit our Authorization header from a victim's
session, BUT browser extensions and misconfigured CORS policies routinely
break this assumption. For endpoints whose blast radius is restart/RCE-
equivalent (restore apply), we additionally require the request to come
from our own origin.
"""
host = request.headers.get("host", "").lower()
if not host:
raise HTTPException(status_code=400, detail="Missing Host header")
def _host_of(u: str | None) -> str:
if not u:
return ""
try:
return (urlparse(u).netloc or "").lower()
except Exception: # noqa: BLE001
return ""
origin_host = _host_of(request.headers.get("origin"))
referer_host = _host_of(request.headers.get("referer"))
# At least one of Origin/Referer must be present and match Host.
# Legitimate browser requests to this endpoint always ship Origin.
same = (origin_host and origin_host == host) or (referer_host and referer_host == host)
if not same:
raise HTTPException(
status_code=403,
detail="Cross-origin request rejected",
)
def _backup_dir():
return app_config.data_dir / "backups"
def _resolve_backup_file(filename: str):
"""Validate filename and resolve to a path strictly inside the backup dir."""
if not filename.startswith("backup-") or not filename.endswith(".json"):
raise HTTPException(status_code=404, detail="Backup file not found")
if "/" in filename or "\\" in filename or ".." in filename or "\x00" in filename:
raise HTTPException(status_code=404, detail="Backup file not found")
base = _backup_dir().resolve()
candidate = (base / filename).resolve()
try:
candidate.relative_to(base)
except ValueError:
raise HTTPException(status_code=404, detail="Backup file not found")
if not candidate.is_file():
raise HTTPException(status_code=404, detail="Backup file not found")
return candidate
# ---------------------------------------------------------------------------
# Export
# ---------------------------------------------------------------------------
@router.get("/export")
async def export_config(
secrets_mode: SecretsMode = Query(default=SecretsMode.EXCLUDE),
categories: str = Query(default=""),
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
):
"""Export configuration as a downloadable JSON file."""
cats = None
if categories:
try:
cats = [BackupCategory(c.strip()) for c in categories.split(",") if c.strip()]
except ValueError as e:
raise HTTPException(status_code=400, detail=f"Invalid category: {e}")
backup = await export_backup(session, user.id, cats, secrets_mode)
ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
filename = f"notify-bridge-backup-{ts}.json"
return JSONResponse(
content=backup.model_dump(),
headers={"Content-Disposition": f'attachment; filename="{filename}"'},
)
# ---------------------------------------------------------------------------
# Validate
# ---------------------------------------------------------------------------
@router.post("/validate")
async def validate_config(
file: UploadFile = File(...),
user: User = Depends(require_admin),
):
"""Validate a backup file without importing."""
content = await _read_upload_bounded(file)
try:
raw = json.loads(content)
except json.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")
result = validate_backup(raw)
return result.model_dump()
# ---------------------------------------------------------------------------
# Import
# ---------------------------------------------------------------------------
@router.post("/import")
async def import_config(
file: UploadFile = File(...),
conflict_mode: ConflictMode = Query(default=ConflictMode.SKIP),
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
):
"""Import configuration from a backup file."""
content = await _read_upload_bounded(file)
try:
raw = json.loads(content)
except json.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")
# Validate first
validation = validate_backup(raw)
if not validation.valid:
raise HTTPException(status_code=400, detail=f"Invalid backup: {'; '.join(validation.errors)}")
backup = BackupFile.model_validate(raw)
result = await import_backup(session, user.id, backup, conflict_mode)
return result.model_dump()
# ---------------------------------------------------------------------------
# Pending restore (prepare → apply on next restart)
# ---------------------------------------------------------------------------
async def _set_app_setting(session: AsyncSession, key: str, value: str) -> None:
row = await session.get(AppSetting, key)
if row:
row.value = value
else:
row = AppSetting(key=key, value=value)
session.add(row)
async def _clear_pending_restore_markers(session: AsyncSession) -> None:
for key in (
PENDING_RESTORE_PATH_KEY,
PENDING_RESTORE_CONFLICT_KEY,
PENDING_RESTORE_UPLOADED_AT_KEY,
PENDING_RESTORE_UPLOADED_BY_KEY,
PENDING_RESTORE_SHA256_KEY,
):
row = await session.get(AppSetting, key)
if row:
await session.delete(row)
@router.post("/prepare-restore")
async def prepare_restore(
file: UploadFile = File(...),
conflict_mode: ConflictMode = Query(default=ConflictMode.SKIP),
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
):
"""Stage a backup for restore on next backend restart.
Validates the uploaded file, writes it to ``data/pending_restore.json``,
and persists marker settings so startup will apply it atomically.
"""
content = await _read_upload_bounded(file)
try:
raw = json.loads(content)
except json.JSONDecodeError as e:
raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")
validation = validate_backup(raw)
if not validation.valid:
raise HTTPException(
status_code=400,
detail=f"Invalid backup: {'; '.join(validation.errors)}",
)
pending_path = _pending_restore_path()
pending_path.parent.mkdir(parents=True, exist_ok=True)
# Atomic write: write to tmp then rename, so a crash mid-write never
# leaves a truncated pending_restore.json that would break startup apply.
payload = json.dumps(raw).encode("utf-8")
digest = hashlib.sha256(payload).hexdigest()
tmp_path = pending_path.with_suffix(pending_path.suffix + ".tmp")
tmp_path.write_bytes(payload)
os.replace(tmp_path, pending_path)
# Best-effort tighten perms so a non-root local user cannot swap the file
# for one they control between prepare and restart. On Windows this is a
# no-op; on POSIX we restrict to owner-only rw.
try:
os.chmod(pending_path, 0o600)
except OSError:
pass
now_iso = datetime.now(timezone.utc).isoformat()
await _set_app_setting(session, PENDING_RESTORE_PATH_KEY, str(pending_path))
await _set_app_setting(session, PENDING_RESTORE_CONFLICT_KEY, conflict_mode.value)
await _set_app_setting(session, PENDING_RESTORE_UPLOADED_AT_KEY, now_iso)
await _set_app_setting(session, PENDING_RESTORE_UPLOADED_BY_KEY, user.username)
await _set_app_setting(session, PENDING_RESTORE_SHA256_KEY, digest)
await session.commit()
return {
"pending": True,
"uploaded_at": now_iso,
"uploaded_by": user.username,
"conflict_mode": conflict_mode.value,
"validation": validation.model_dump(),
"supervised": _is_supervised(),
}
@router.get("/pending-restore")
async def get_pending_restore(
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
):
"""Return current pending-restore state, or null if none."""
path_row = await session.get(AppSetting, PENDING_RESTORE_PATH_KEY)
if not path_row or not path_row.value:
return {"pending": False, "supervised": _is_supervised()}
conflict_row = await session.get(AppSetting, PENDING_RESTORE_CONFLICT_KEY)
uploaded_at_row = await session.get(AppSetting, PENDING_RESTORE_UPLOADED_AT_KEY)
uploaded_by_row = await session.get(AppSetting, PENDING_RESTORE_UPLOADED_BY_KEY)
return {
"pending": True,
"uploaded_at": uploaded_at_row.value if uploaded_at_row else None,
"uploaded_by": uploaded_by_row.value if uploaded_by_row else None,
"conflict_mode": (conflict_row.value if conflict_row else ConflictMode.SKIP.value),
"supervised": _is_supervised(),
}
@router.delete("/pending-restore")
async def cancel_pending_restore(
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
):
"""Cancel a pending restore."""
pending_path = _pending_restore_path()
if pending_path.exists():
pending_path.unlink()
await _clear_pending_restore_markers(session)
await session.commit()
return {"cancelled": True}
def _is_supervised() -> bool:
"""Heuristic: is this process managed by something that will respawn it?
Priority order:
1. Explicit operator override: ``NOTIFY_BRIDGE_SUPERVISED`` env var or
the ``supervised`` AppSetting (values: ``true``/``false``/``auto``).
``auto`` (or unset) falls through to the detection heuristic.
2. Heuristic: look at common container/service-manager env vars.
Used by the frontend to decide whether to offer "Restart now" — a bad
guess here is a foot-gun (process exits, stays dead), so err on the side
of false when unsure.
"""
override = os.environ.get("NOTIFY_BRIDGE_SUPERVISED", "").strip().lower()
if override in ("true", "1", "yes", "on"):
return True
if override in ("false", "0", "no", "off"):
return False
for var in ("CONTAINER", "DOCKER_CONTAINER", "KUBERNETES_SERVICE_HOST",
"INVOCATION_ID", "PM2_HOME"):
if os.environ.get(var):
return True
if os.path.exists("/.dockerenv"):
return True
return False
@router.post("/apply-restart")
async def apply_and_restart(
request: Request,
background_tasks: BackgroundTasks,
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
):
"""Trigger a graceful exit so the supervisor respawns and applies the pending restore.
Only allowed when a pending restore is staged AND the process is supervised.
Requires same-origin Origin/Referer — this endpoint's blast radius is a
full config replace + restart, so an admin token alone (vulnerable to
XSS-driven CSRF) is not enough.
"""
_check_same_origin(request)
path_row = await session.get(AppSetting, PENDING_RESTORE_PATH_KEY)
if not path_row or not path_row.value:
raise HTTPException(status_code=409, detail="No pending restore to apply")
if not _is_supervised():
raise HTTPException(
status_code=409,
detail=(
"This process is not supervised. Restart the backend manually to apply "
"the pending restore, or use the Cancel button."
),
)
async def _shutdown_soon() -> None:
# Small delay so the HTTP response flushes before the signal fires.
await asyncio.sleep(0.5)
_LOGGER.warning("Admin triggered restart to apply pending restore")
# SIGTERM lets uvicorn run its normal graceful shutdown:
# drain in-flight requests, fire the lifespan shutdown hooks
# (close_http_session, scheduler.shutdown), then exit. The
# supervisor respawns, and startup applies the pending restore.
try:
os.kill(os.getpid(), signal.SIGTERM)
except Exception: # noqa: BLE001 — last-resort fallback on platforms that reject SIGTERM
_LOGGER.exception("SIGTERM delivery failed; falling back to os._exit")
os._exit(0)
background_tasks.add_task(_shutdown_soon)
return {"restart_requested": True}
# ---------------------------------------------------------------------------
# Scheduled backup settings
# ---------------------------------------------------------------------------
_BACKUP_SETTING_KEYS = {
"backup_scheduled_enabled": "false",
"backup_scheduled_interval_hours": "24",
"backup_secrets_mode": "exclude",
"backup_retention_count": "5",
}
@router.get("/scheduled")
async def get_scheduled_settings(
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
):
"""Get scheduled backup settings."""
result = {}
for key, default in _BACKUP_SETTING_KEYS.items():
row = await session.get(AppSetting, key)
result[key] = row.value if row and row.value else default
return result
@router.put("/scheduled")
async def update_scheduled_settings(
body: dict,
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
):
"""Update scheduled backup settings and reschedule."""
for key, default in _BACKUP_SETTING_KEYS.items():
value = body.get(key)
if value is None:
continue
row = await session.get(AppSetting, key)
if row:
row.value = str(value)
else:
row = AppSetting(key=key, value=str(value))
session.add(row)
await session.commit()
# Reschedule backup job
from ..services.scheduler import schedule_backup
enabled = body.get("backup_scheduled_enabled", "false") == "true"
interval_hours = int(body.get("backup_scheduled_interval_hours", "24"))
if enabled:
await schedule_backup(interval_hours)
else:
from ..services.scheduler import unschedule_backup
await unschedule_backup()
# Return updated settings
result = {}
for key, default in _BACKUP_SETTING_KEYS.items():
row = await session.get(AppSetting, key)
result[key] = row.value if row and row.value else default
return result
# ---------------------------------------------------------------------------
# Backup file management
# ---------------------------------------------------------------------------
@router.get("/files")
async def get_backup_files(
user: User = Depends(require_admin),
):
"""List saved backup files."""
return list_backup_files(_backup_dir())
@router.post("/files")
async def create_manual_backup(
secrets_mode: SecretsMode = Query(default=SecretsMode.EXCLUDE),
user: User = Depends(require_admin),
session: AsyncSession = Depends(get_session),
):
"""Create a backup file in the backups directory (manual checkpoint).
Produces the same JSON format as scheduled backups, saved under
``data/backups/backup-<timestamp>.json``. Retention is managed by the
existing scheduled-backup settings (``backup_retention_count``).
"""
backup_dir = _backup_dir()
filepath = await export_backup_to_file(session, user.id, backup_dir, secrets_mode)
# Apply the same retention as scheduled backups if configured.
retention_row = await session.get(AppSetting, "backup_retention_count")
if retention_row and retention_row.value:
try:
retention = int(retention_row.value)
if retention > 0:
cleanup_old_backups(backup_dir, keep=retention)
except ValueError:
pass
stat = filepath.stat()
return {
"filename": filepath.name,
"size": stat.st_size,
"created_at": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat(),
}
@router.get("/files/{filename}")
async def download_backup_file(
filename: str,
user: User = Depends(require_admin),
):
"""Download a specific backup file."""
filepath = _resolve_backup_file(filename)
try:
content = json.loads(filepath.read_text(encoding="utf-8"))
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to read backup: {e}")
return JSONResponse(
content=content,
headers={"Content-Disposition": f'attachment; filename="{filename}"'},
)
@router.delete("/files/{filename}")
async def delete_backup_file(
filename: str,
user: User = Depends(require_admin),
):
"""Delete a specific backup file."""
filepath = _resolve_backup_file(filename)
filepath.unlink()
return {"deleted": filename}