Files
notify-bridge/packages/server/src/notify_bridge_server/main.py
T
alexei.dolgolyov 6a8f374678 feat: observability, per-receiver Telegram options, oversized-video fallback
Operability:
- Correlation IDs end-to-end: shared dispatch_id between log lines and
  EventLog rows (event/watcher/scheduled/deferred/action/HA/command paths)
  and a new X-Request-Id middleware that normalizes inbound ids and binds
  request_id into log context.
- dispatch_summary block merged into EventLog.details: per-target
  success/failure counts plus Telegram media delivered/skipped/failed and
  truncated error lists, so partial outcomes surface in the UI.
- Diagnostic mode: admin can flip one module to DEBUG for a bounded
  window with auto-revert (in-memory only; setup_logging() resets on
  boot, lifespan reverts on shutdown). New /diagnostic-mode endpoints
  plus DiagnosticsCassette UI on the settings page.

Telegram:
- Per-receiver options: disable_notification (silent send) and
  message_thread_id (forum-topic routing), wired through the dispatcher
  via a ContextVar so all four send sites (sendMessage / sendPhoto-Video-
  Document / sendMediaGroup / cache-hit POST) pick them up.
- send_large_videos_as_documents target setting: bypass the 50 MB
  sendVideo cap by falling back to sendDocument for oversized videos.
- sendMediaGroup byte-budget enforcement (TELEGRAM_MAX_GROUP_TOTAL_BYTES,
  45 MB) with per-item fallback on chunk failure so a stale file_id no
  longer silently drops a cached asset.

Tests:
- New: diagnostic_mode, dispatch_summary, request_correlation,
  telegram_media_group_partial, telegram_per_send_options.

Docs:
- .claude/reviews/: six-axis production-readiness review of v0.8.1.
- .claude/docs/functional-review-2026-05-28.md: focused review of
  Telegram/Immich/logging subsystems.
2026-05-28 15:19:31 +03:00

489 lines
20 KiB
Python

"""Notify Bridge Server — FastAPI application entry point."""
import logging
import uuid
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from slowapi import _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.middleware import SlowAPIMiddleware
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.requests import Request as StarletteRequest
from starlette.responses import Response as StarletteResponse
from notify_bridge_core.log_context import bind_log_context
from .config import settings as _log_cfg
from .logging_setup import setup_logging
# Configure logging as early as possible, from env-based config only, so the
# project imports that follow can already log. DB-backed AppSetting rows
# (``log_level`` / ``log_levels`` / ``log_format``) override these values
# after migrations — see the lifespan block below.
setup_logging(
    # The debug flag wins over whatever level is configured.
    level=_log_cfg.log_level if not _log_cfg.debug else "DEBUG",
    fmt=_log_cfg.log_format,
    per_module_levels=_log_cfg.log_levels,
)

_LOGGER = logging.getLogger(__name__)
from .database.engine import init_db
from .database.models import * # noqa: F401,F403 — ensure all models registered
from .auth.routes import router as auth_router
from .api.providers import router as providers_router
from .api.notification_trackers import router as notification_trackers_router
from .api.notification_tracker_targets import router as notification_tracker_targets_router
from .api.tracking_configs import router as tracking_configs_router
from .api.template_configs import router as template_configs_router
from .api.targets import router as targets_router
from .api.target_receivers import router as target_receivers_router
from .api.telegram_bots import router as telegram_bots_router
from .api.email_bots import router as email_bots_router
from .api.matrix_bots import router as matrix_bots_router
from .api.users import router as users_router
from .api.status import router as status_router
from .api.template_vars import router as template_vars_router
from .api.app_settings import router as app_settings_router
from .api.command_configs import router as command_configs_router
from .api.command_trackers import router as command_trackers_router
from .api.command_template_configs import router as command_template_configs_router
from .api.actions import router as actions_router
from .api.action_rules import router as action_rules_router
from .api.action_types import router as action_types_router
from .commands.webhook import router as webhook_router, set_webhook_secret
from .api.webhooks import router as webhooks_router
from .api.webhook_logs import router as webhook_logs_router
from .api.backup import router as backup_router
from .api.metrics import router as metrics_router
# Readiness flag — flipped to True once the scheduler and the Home Assistant
# subscriptions have been started, and back to False as soon as shutdown
# begins. Exposed via /api/ready for orchestrators.
_READY: bool = False


async def _shutdown_services() -> None:
    """Stop background services and release shared resources, in order.

    Order matters: HA supervisors are cancelled FIRST so they release their
    WS connections before the shared HTTP session is closed; the scheduler is
    stopped next because ``scheduler.shutdown(wait=True)`` drains in-flight
    jobs that may also use the shared session; the HTTP session and the DB
    engine go last.

    Each stage sits in a ``try``/``finally`` so a failure in one stage still
    lets the later stages run. The exception (if any) propagates to the
    caller once every stage has been attempted.
    """
    try:
        from .services.ha_subscription import stop_all as stop_ha_subscriptions

        await stop_ha_subscriptions()
    finally:
        try:
            # Restore the DB-configured baseline level for any temporary
            # DEBUG overrides before the engine is disposed. The next boot's
            # setup_logging() would wipe them anyway, but being explicit at
            # shutdown is cheaper than relying on a re-init.
            from .services.diagnostic_mode import revert_all as revert_diagnostics

            try:
                await revert_diagnostics()
            except Exception:  # pragma: no cover — never block shutdown on this.
                _LOGGER.exception("Failed to revert diagnostic overrides during shutdown")

            from .services.scheduler import get_scheduler

            scheduler = get_scheduler()
            if scheduler.running:
                scheduler.shutdown(wait=True)
        finally:
            try:
                from .services.http_session import close_http_session

                await close_http_session()
            finally:
                from .database.engine import dispose_engine

                await dispose_engine()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run application startup before serving and teardown afterwards.

    Startup, in order: ``init_db()``, a pre-migration DB snapshot, the data
    migrations, ``seed_all()``, DB-backed log levels, any staged restore, the
    Telegram webhook secret, the scheduler, and the Home Assistant
    subscriptions. ``_READY`` is set to True only after all of that succeeds.

    Teardown runs in a ``finally`` around the ``yield`` so it also executes
    when the serving phase ends with an exception or cancellation; see
    ``_shutdown_services`` for the ordering.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    global _READY
    await init_db()

    # Run data migrations (idempotent)
    from .database.engine import get_engine
    from .database.migrations import (
        migrate_schema,
        migrate_tracker_targets,
        migrate_entity_refactor,
        migrate_template_slots,
        migrate_target_receivers,
        migrate_template_locale,
        migrate_receivers_from_config,
        migrate_command_slot_locale,
        migrate_notification_slot_locale,
        migrate_user_token_version,
        migrate_performance_indexes,
        migrate_chat_action_to_column,
        migrate_deferred_dispatch_event_log_fk,
        migrate_deferred_dispatch_unique_pending,
        migrate_uniqueness_constraints,
        migrate_eventlog_provider_fk,
        migrate_schema_version,
    )
    from .database.snapshot import snapshot_and_prune

    engine = get_engine()

    # Take a consistent DB snapshot BEFORE migrations run, so operators can
    # roll back a bad upgrade by restoring one file. Best-effort — failures
    # are logged, not raised.
    await snapshot_and_prune(
        engine,
        _log_cfg.data_dir / "backups",
        keep=_log_cfg.pre_migrate_snapshot_keep,
    )

    await migrate_schema(engine)
    await migrate_tracker_targets(engine)
    await migrate_entity_refactor(engine)
    await migrate_template_slots(engine)
    await migrate_target_receivers(engine)
    await migrate_template_locale(engine)
    await migrate_receivers_from_config(engine)
    await migrate_command_slot_locale(engine)
    await migrate_notification_slot_locale(engine)
    await migrate_user_token_version(engine)
    await migrate_performance_indexes(engine)
    await migrate_chat_action_to_column(engine)
    # FK-rebuild MUST run before the unique-index creation: drop+create_all
    # of deferred_dispatch wipes its indexes; the next migration re-establishes
    # the partial unique index.
    await migrate_deferred_dispatch_event_log_fk(engine)
    await migrate_deferred_dispatch_unique_pending(engine)
    # Backfill missing UNIQUE indexes on webhook hot paths (deduping any
    # existing duplicates). Runs after performance_indexes so non-unique
    # support indexes are already in place.
    await migrate_uniqueness_constraints(engine)
    # Document EventLog.provider_id FK strategy on existing tables (no-op
    # on SQLite besides the log line; new tables get the FK from create_all).
    await migrate_eventlog_provider_fk(engine)
    await migrate_schema_version(engine)

    from .database.seeds import seed_all

    await seed_all()

    # Apply DB-backed logging settings (override env-based boot config).
    # log_format still needs a restart — changing it means swapping the
    # handler formatter entirely.
    try:
        from sqlmodel.ext.asyncio.session import AsyncSession as _AS_log
        from .api.app_settings import get_setting as _get_log_setting
        from .logging_setup import apply_log_levels

        async with _AS_log(engine) as _log_session:
            db_level = await _get_log_setting(_log_session, "log_level")
            db_levels = await _get_log_setting(_log_session, "log_levels")
        apply_log_levels(level=db_level or None, per_module_levels=db_levels)
        _LOGGER.info(
            "Logging initialized: level=%s overrides=%r format=%s",
            db_level or _log_cfg.log_level, db_levels or _log_cfg.log_levels,
            _log_cfg.log_format,
        )
    except Exception:  # pragma: no cover — never let logging setup abort boot
        _LOGGER.exception("Failed to apply DB-backed log settings; keeping env-based levels")

    # Apply any pending restore staged via /api/backup/prepare-restore
    from .services.pending_restore import apply_pending_restore_if_any

    await apply_pending_restore_if_any()

    # Configure webhook secret from DB setting (falls back to env var)
    from sqlmodel.ext.asyncio.session import AsyncSession as _AS
    from .api.app_settings import get_setting as _get_setting

    async with _AS(engine) as _session:
        _secret = await _get_setting(_session, "telegram_webhook_secret")
    set_webhook_secret(_secret or None)

    from .services.scheduler import start_scheduler

    await start_scheduler()

    # Phase 1 of the Home Assistant provider: subscription-based ingest runs
    # outside the polling scheduler. ``start_all`` spawns one supervisor task
    # per enabled HA provider row. No-op when no HA providers are configured.
    from .services.ha_subscription import start_all as start_ha_subscriptions

    await start_ha_subscriptions()

    _READY = True
    try:
        yield
    finally:
        # Flip readiness off before touching any service so /api/ready
        # short-circuits while teardown is in progress.
        _READY = False
        await _shutdown_services()
from .version import resolve_version as _resolve_version

# Resolved once at import time; reused by the FastAPI metadata and by the
# /api/health and /api/ready payloads.
_APP_VERSION = _resolve_version()

app = FastAPI(
    title="Notify Bridge",
    version=_APP_VERSION,
    lifespan=lifespan,
)
# --- Security headers ---
# Bounded character set for accepted inbound X-Request-Id values. Anything
# outside this is replaced with a server-generated id so a malicious header
# can't smuggle CR/LF into log lines or break grep-by-field parsing.
# ``:`` is intentionally excluded so an inbound value can't masquerade as a
# server-minted ``disp:<hex>`` / ``req:<hex>`` id and confuse operator greps.
_REQUEST_ID_MAX_LEN = 64
_REQUEST_ID_ALLOWED = set(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
)
def _normalize_request_id(raw: str | None) -> str:
if not raw:
return f"req:{uuid.uuid4().hex[:12]}"
raw = raw.strip()
if not raw or len(raw) > _REQUEST_ID_MAX_LEN:
return f"req:{uuid.uuid4().hex[:12]}"
if not all(c in _REQUEST_ID_ALLOWED for c in raw):
return f"req:{uuid.uuid4().hex[:12]}"
return raw
class RequestContextMiddleware(BaseHTTPMiddleware):
    """Attach a correlation id to each request and echo it on the response.

    The id is taken from the inbound ``X-Request-Id`` header (so an upstream
    proxy with its own correlation system can propagate its id) after it has
    been passed through ``_normalize_request_id``; unusable or missing values
    are replaced by a short random ``req:<12 hex>`` id. The same id is always
    written to the response ``X-Request-Id`` header so the SPA can surface it
    for operator-friendly bug reports.

    While the downstream app runs, the id is bound via
    :func:`bind_log_context` so it appears on every log line emitted during
    request handling (``[req=...]``) and is picked up by
    :func:`notify_bridge_core.log_context.enrich_details_with_correlation`
    when an ``EventLog`` row is written during the same request.
    """

    async def dispatch(
        self,
        request: StarletteRequest,
        call_next: RequestResponseEndpoint,
    ) -> StarletteResponse:
        inbound_id = request.headers.get("x-request-id")
        correlation_id = _normalize_request_id(inbound_id)

        # Only the downstream call needs the id bound in the log context.
        with bind_log_context(request_id=correlation_id):
            outbound: StarletteResponse = await call_next(request)

        outbound.headers["X-Request-Id"] = correlation_id
        return outbound
_CSP = (
"default-src 'self'; "
"img-src 'self' data: blob: https:; "
"style-src 'self' 'unsafe-inline'; "
# SvelteKit's static adapter emits an inline bootstrap <script> with the
# hydration payload, so 'self' alone blocks the SPA from starting.
# 'unsafe-inline' re-enables it; the app's primary XSS protection still
# comes from Svelte's template auto-escaping and frontend/sanitize.ts
# for the few {@html} paths that render user-controlled content.
"script-src 'self' 'unsafe-inline'; "
"connect-src 'self'; "
"font-src 'self' data:; "
"base-uri 'self'; "
"form-action 'self'; "
"frame-ancestors 'none'"
)
class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Add a fixed set of security-related headers to every response.

    Four headers are always overwritten. ``Content-Security-Policy`` and
    ``Strict-Transport-Security`` are only filled in when the downstream
    response has not already set them.
    """

    async def dispatch(
        self,
        request: StarletteRequest,
        call_next: RequestResponseEndpoint,
    ):
        response: StarletteResponse = await call_next(request)
        headers = response.headers

        # NOTE(review): X-XSS-Protection is a legacy header that current
        # browsers ignore; confirm against up-to-date OWASP guidance whether
        # it should still be sent with this value.
        unconditional = (
            ("X-Content-Type-Options", "nosniff"),
            ("X-Frame-Options", "DENY"),
            ("X-XSS-Protection", "1; mode=block"),
            ("Referrer-Policy", "strict-origin-when-cross-origin"),
        )
        for header_name, header_value in unconditional:
            headers[header_name] = header_value

        headers.setdefault("Content-Security-Policy", _CSP)

        # HSTS only makes sense over HTTPS; set when the edge terminates TLS
        # and forwards X-Forwarded-Proto=https.
        behind_tls_edge = request.headers.get("x-forwarded-proto") == "https"
        if behind_tls_edge:
            headers.setdefault(
                "Strict-Transport-Security",
                "max-age=31536000; includeSubDomains",
            )
        return response
# Middleware registration. The order of the add_middleware() calls below is
# significant and must be preserved.
app.add_middleware(SecurityHeadersMiddleware)

# --- Rate limiting ---
from .auth.routes import limiter

app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
app.add_middleware(SlowAPIMiddleware)

# --- CORS ---
from .config import settings as _cfg

# Comma-separated origin list from config; blank entries are dropped.
_origins = [
    origin
    for origin in (piece.strip() for piece in _cfg.cors_allowed_origins.split(","))
    if origin
]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request-ID middleware is added LAST so it becomes the outermost wrapper —
# every other middleware (CORS, rate limit, security headers) then logs with
# the request_id already bound, and CORS preflight responses also carry the
# X-Request-Id echo header.
app.add_middleware(RequestContextMiddleware)
# Register routes — static paths before parameterized. The tuple order is the
# registration order; keep it when adding a router.
for _router in (
    auth_router,
    template_vars_router,
    providers_router,
    notification_trackers_router,
    notification_tracker_targets_router,
    tracking_configs_router,
    template_configs_router,
    targets_router,
    target_receivers_router,
    telegram_bots_router,
    email_bots_router,
    matrix_bots_router,
    users_router,
    status_router,
    app_settings_router,
    action_types_router,
    action_rules_router,
    actions_router,
    command_configs_router,
    command_trackers_router,
    command_template_configs_router,
    webhook_router,
    webhooks_router,
    webhook_logs_router,
    backup_router,
    metrics_router,
):
    app.include_router(_router)
@app.get("/api/health")
async def health():
    """Liveness probe: report that the process is up and responding.

    Performs no dependency checks — it only returns a static status plus the
    application version. Keep this endpoint anonymous and trivially cheap.
    """
    payload = {"status": "ok", "version": _APP_VERSION}
    return payload
def _ready_probe_error(name: str, exc: BaseException) -> str:
    """Format a failed probe as ``"<name>: <detail>"`` for the ``errors`` list.

    Falls back to the exception class name when ``str(exc)`` is empty. The
    ``TimeoutError`` raised by ``asyncio.wait_for`` carries no message, so a
    plain ``str(exc)`` would produce an uninformative ``"db: "`` entry.
    """
    return f"{name}: {str(exc) or type(exc).__name__}"


async def _ready_check_db() -> str | None:
    """Run ``SELECT 1`` with a 2s timeout; return an error string, or None if OK."""
    import asyncio as _asyncio
    from sqlalchemy import text as _text

    try:
        from .database.engine import get_engine

        engine = get_engine()

        async def _ping_db() -> None:
            async with engine.connect() as conn:
                await conn.execute(_text("SELECT 1"))

        await _asyncio.wait_for(_ping_db(), timeout=2.0)
    except Exception as exc:  # noqa: BLE001
        return _ready_probe_error("db", exc)
    return None


def _ready_check_scheduler() -> str | None:
    """Return an error string unless the scheduler reports ``running``."""
    try:
        from .services.scheduler import get_scheduler

        if get_scheduler().running:
            return None
        return "scheduler: not running"
    except Exception as exc:  # noqa: BLE001
        return _ready_probe_error("scheduler", exc)


async def _ready_check_ha() -> tuple[str, str | None]:
    """Return ``(status, error)`` for the Home Assistant supervisor probe.

    ``status`` is ``"na"`` when no ``home_assistant`` provider rows exist,
    ``"ok"`` when at least one supervisor task is present and not done, and
    ``"degraded"`` otherwise (including when the probe itself raises). A task
    being not-yet-connected is fine — it only has to exist.
    """
    try:
        from sqlmodel import select as _select
        from sqlmodel.ext.asyncio.session import AsyncSession as _AS
        from .database.models import ServiceProvider
        from .services.ha_subscription import _running_tasks as _ha_tasks
        from .database.engine import get_engine as _get_engine_ha

        async with _AS(_get_engine_ha()) as _session:
            _result = await _session.exec(
                _select(ServiceProvider).where(
                    ServiceProvider.type == "home_assistant",
                )
            )
            ha_providers = _result.all()

        if not ha_providers:
            return "na", None
        any_alive = any(
            t is not None and not t.done() for t in _ha_tasks.values()
        )
        return ("ok" if any_alive else "degraded"), None
    except Exception as exc:  # noqa: BLE001
        # Never let the HA probe fail readiness — it's informational.
        return "degraded", _ready_probe_error("ha", exc)


@app.get("/api/ready")
async def ready():
    """Readiness: deep dependency check.

    Verifies each critical dependency is actually reachable, not just that
    the app finished its lifespan startup. Returns 503 if any *required*
    check fails (db, scheduler), or if startup has not completed yet. Home
    Assistant supervisor presence is informational — a degraded HA does not
    flip readiness off.

    Response shape:
        {
            "ready": bool,
            "checks": {"db": "ok|fail", "scheduler": "ok|fail", "ha": "ok|degraded|na"},
            "errors": [str, ...],
            "version": str
        }

    NOTE(review): ``errors`` carries raw exception text; confirm that is
    acceptable for whoever can reach this endpoint.
    """
    from starlette.responses import JSONResponse

    if not _READY:
        # Lifespan still running — short-circuit so we don't poke a half-built engine.
        return JSONResponse(
            {
                "ready": False,
                "checks": {"db": "fail", "scheduler": "fail", "ha": "na"},
                "errors": ["startup not complete"],
                "version": _APP_VERSION,
            },
            status_code=503,
        )

    checks: dict[str, str] = {}
    errors: list[str] = []

    # --- DB: SELECT 1 with a 2s timeout ---
    db_error = await _ready_check_db()
    checks["db"] = "ok" if db_error is None else "fail"
    if db_error is not None:
        errors.append(db_error)

    # --- Scheduler: APScheduler must be running ---
    scheduler_error = _ready_check_scheduler()
    checks["scheduler"] = "ok" if scheduler_error is None else "fail"
    if scheduler_error is not None:
        errors.append(scheduler_error)

    # --- HA supervisor: informational only ---
    checks["ha"], ha_error = await _ready_check_ha()
    if ha_error is not None:
        errors.append(ha_error)

    required_ok = checks["db"] == "ok" and checks["scheduler"] == "ok"
    body = {
        "ready": required_ok,
        "checks": checks,
        "errors": errors,
        "version": _APP_VERSION,
    }
    if not required_ok:
        return JSONResponse(body, status_code=503)
    return body
# --- Serve frontend static files (production) ---
# Must come AFTER all API routes so /api/* takes priority
from pathlib import Path
if _cfg.static_dir and Path(_cfg.static_dir).is_dir():
    from fastapi.staticfiles import StaticFiles
    from starlette.exceptions import HTTPException as StarletteHTTPException
    from starlette.responses import FileResponse

    _static_dir = Path(_cfg.static_dir)

    class SPAStaticFiles(StaticFiles):
        """StaticFiles with an index.html fallback for SvelteKit client-side routes.

        When the normal static lookup raises a 404 and the path does not
        start with ``api/``, ``index.html`` is served instead so that deep
        links like /settings hydrate the SPA. Every other error — including
        404s under ``api/`` — is re-raised unchanged.
        """

        async def get_response(self, path: str, scope):
            try:
                return await super().get_response(path, scope)
            except StarletteHTTPException as exc:
                spa_route = exc.status_code == 404 and not path.startswith("api/")
                if not spa_route:
                    raise
                return FileResponse(_static_dir / "index.html")

    # Mounted at "/" — keep this after every API route so /api/* wins.
    app.mount("/", SPAStaticFiles(directory=_cfg.static_dir, html=True), name="frontend")
def run():
    """Serve ``app`` with uvicorn using the host, port and proxy settings from config.

    Forwarded headers are trusted only from ``forwarded_allow_ips`` (default
    ``127.0.0.1`` when unset), and uvicorn's access log is switched off when
    the ``debug`` setting is on.
    """
    import uvicorn

    server_options = {
        "host": _cfg.host,
        "port": _cfg.port,
        "proxy_headers": True,
        "forwarded_allow_ips": _cfg.forwarded_allow_ips or "127.0.0.1",
        "timeout_graceful_shutdown": _cfg.graceful_shutdown_seconds,
        "access_log": not _cfg.debug,
    }
    uvicorn.run(app, **server_options)