notify-bridge/packages/server/src/notify_bridge_server/main.py

"""Notify Bridge Server — FastAPI application entry point."""

import logging
import uuid
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from slowapi import _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.middleware import SlowAPIMiddleware
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.requests import Request as StarletteRequest
from starlette.responses import Response as StarletteResponse

from notify_bridge_core.log_context import bind_log_context

from .config import settings as _log_cfg
from .logging_setup import setup_logging

# Boot logging from env-based config. DB-backed AppSetting rows (``log_level`` /
# ``log_levels`` / ``log_format``) override this after migrations — see the
# lifespan block below.
setup_logging(
    # The ``debug`` flag wins over the configured level: when it is set the
    # root level is forced to DEBUG regardless of ``log_level``.
    level="DEBUG" if _log_cfg.debug else _log_cfg.log_level,
    fmt=_log_cfg.log_format,
    per_module_levels=_log_cfg.log_levels,
)
# Module-level logger, used by the lifespan startup/shutdown hooks below.
_LOGGER = logging.getLogger(__name__)

from .database.engine import init_db
from .database.models import *  # noqa: F401,F403 — ensure all models registered

from .auth.routes import router as auth_router
from .api.providers import router as providers_router
from .api.notification_trackers import router as notification_trackers_router
from .api.notification_tracker_targets import router as notification_tracker_targets_router
from .api.tracking_configs import router as tracking_configs_router
from .api.template_configs import router as template_configs_router
from .api.targets import router as targets_router
from .api.target_receivers import router as target_receivers_router
from .api.telegram_bots import router as telegram_bots_router
from .api.email_bots import router as email_bots_router
from .api.matrix_bots import router as matrix_bots_router
from .api.users import router as users_router
from .api.status import router as status_router
from .api.template_vars import router as template_vars_router
from .api.app_settings import router as app_settings_router
from .api.command_configs import router as command_configs_router
from .api.command_trackers import router as command_trackers_router
from .api.command_template_configs import router as command_template_configs_router
from .api.actions import router as actions_router
from .api.action_rules import router as action_rules_router
from .api.action_types import router as action_types_router
from .commands.webhook import router as webhook_router, set_webhook_secret
from .api.webhooks import router as webhooks_router
from .api.webhook_logs import router as webhook_logs_router
from .api.backup import router as backup_router
from .api.metrics import router as metrics_router


# Readiness flag — set to True as the last step of lifespan startup (after
# the scheduler and the HA subscriptions have been started) and back to False
# as the first step of shutdown. Read by /api/ready, which short-circuits to
# a 503 response while the flag is False.
_READY: bool = False


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run application startup before ``yield`` and shutdown after it.

    Startup, in order: ``init_db``; a pre-migration snapshot
    (``snapshot_and_prune``); the ``migrate_*`` functions in the fixed order
    below; ``seed_all``; DB-backed log levels (best-effort — failures are
    logged and boot continues); ``apply_pending_restore_if_any``; the
    Telegram webhook secret; ``start_scheduler``; the Home Assistant
    subscriptions; finally ``_READY`` is set to True.

    Shutdown, in order: ``_READY`` is set to False; HA subscriptions are
    stopped; diagnostic overrides are reverted (best-effort); the scheduler
    is shut down if it is running; the shared HTTP session is closed; the
    engine is disposed.

    Imports are done locally inside the function rather than at module top.
    """
    global _READY
    await init_db()
    # Run data migrations (idempotent)
    from .database.engine import get_engine
    from .database.migrations import (
        migrate_schema,
        migrate_tracker_targets,
        migrate_entity_refactor,
        migrate_template_slots,
        migrate_target_receivers,
        migrate_template_locale,
        migrate_receivers_from_config,
        migrate_command_slot_locale,
        migrate_notification_slot_locale,
        migrate_user_token_version,
        migrate_performance_indexes,
        migrate_chat_action_to_column,
        migrate_deferred_dispatch_event_log_fk,
        migrate_deferred_dispatch_unique_pending,
        migrate_uniqueness_constraints,
        migrate_eventlog_provider_fk,
        migrate_schema_version,
    )
    from .database.snapshot import snapshot_and_prune
    engine = get_engine()
    # Take a consistent DB snapshot BEFORE migrations run, so operators can
    # roll back a bad upgrade by restoring one file. Best-effort — failures
    # are logged, not raised.
    await snapshot_and_prune(
        engine,
        _log_cfg.data_dir / "backups",
        keep=_log_cfg.pre_migrate_snapshot_keep,
    )
    await migrate_schema(engine)
    await migrate_tracker_targets(engine)
    await migrate_entity_refactor(engine)
    await migrate_template_slots(engine)
    await migrate_target_receivers(engine)
    await migrate_template_locale(engine)
    await migrate_receivers_from_config(engine)
    await migrate_command_slot_locale(engine)
    await migrate_notification_slot_locale(engine)
    await migrate_user_token_version(engine)
    await migrate_performance_indexes(engine)
    await migrate_chat_action_to_column(engine)
    # FK-rebuild MUST run before the unique-index creation: drop+create_all
    # of deferred_dispatch wipes its indexes; the next migration re-establishes
    # the partial unique index.
    await migrate_deferred_dispatch_event_log_fk(engine)
    await migrate_deferred_dispatch_unique_pending(engine)
    # Backfill missing UNIQUE indexes on webhook hot paths (deduping any
    # existing duplicates). Runs after performance_indexes so non-unique
    # support indexes are already in place.
    await migrate_uniqueness_constraints(engine)
    # Document EventLog.provider_id FK strategy on existing tables (no-op
    # on SQLite besides the log line; new tables get the FK from create_all).
    await migrate_eventlog_provider_fk(engine)
    await migrate_schema_version(engine)
    from .database.seeds import seed_all
    await seed_all()
    # Apply DB-backed logging settings (override env-based boot config).
    # log_format still needs a restart — changing it means swapping the
    # handler formatter entirely.
    try:
        from sqlmodel.ext.asyncio.session import AsyncSession as _AS_log
        from .api.app_settings import get_setting as _get_log_setting
        from .logging_setup import apply_log_levels
        async with _AS_log(engine) as _log_session:
            db_level = await _get_log_setting(_log_session, "log_level")
            db_levels = await _get_log_setting(_log_session, "log_levels")
        # An empty/falsy DB level is passed on as None.
        apply_log_levels(level=db_level or None, per_module_levels=db_levels)
        _LOGGER.info(
            "Logging initialized: level=%s overrides=%r format=%s",
            db_level or _log_cfg.log_level, db_levels or _log_cfg.log_levels,
            _log_cfg.log_format,
        )
    except Exception:  # pragma: no cover — never let logging setup abort boot
        _LOGGER.exception("Failed to apply DB-backed log settings; keeping env-based levels")
    # Apply any pending restore staged via /api/backup/prepare-restore
    from .services.pending_restore import apply_pending_restore_if_any
    await apply_pending_restore_if_any()
    # Configure webhook secret from DB setting (falls back to env var)
    from sqlmodel.ext.asyncio.session import AsyncSession as _AS
    from .api.app_settings import get_setting as _get_setting
    async with _AS(engine) as _session:
        _secret = await _get_setting(_session, "telegram_webhook_secret")
    # An empty/falsy stored secret is passed on as None.
    set_webhook_secret(_secret or None)
    from .services.scheduler import start_scheduler, get_scheduler
    await start_scheduler()
    # Phase 1 of the Home Assistant provider: subscription-based ingest runs
    # outside the polling scheduler. ``start_all`` spawns one supervisor task
    # per enabled HA provider row. No-op when no HA providers are configured.
    from .services.ha_subscription import start_all as start_ha_subscriptions
    await start_ha_subscriptions()
    _READY = True
    yield
    # Graceful shutdown — cancel HA supervisors FIRST so they release their
    # WS connections before the shared HTTP session is closed. Then stop the
    # polling scheduler. Order matters: scheduler.shutdown(wait=True) drains
    # in-flight jobs that may also use the shared session.
    # NOTE(review): the shutdown steps are not wrapped in try/finally. An
    # exception from any step other than ``revert_diagnostics`` skips the
    # remaining cleanup (scheduler, HTTP session, engine) — confirm that is
    # acceptable.
    _READY = False
    from .services.ha_subscription import stop_all as stop_ha_subscriptions
    await stop_ha_subscriptions()
    # Restore the DB-configured baseline level for any temporary DEBUG
    # overrides before the engine is disposed — so even a forced restart
    # leaves the world tidy and doesn't leak DEBUG state into the next
    # process (which would also be wiped by setup_logging() at boot, but
    # being explicit about shutdown is cheaper than relying on a re-init).
    from .services.diagnostic_mode import revert_all as revert_diagnostics
    try:
        await revert_diagnostics()
    except Exception:  # pragma: no cover — never block shutdown on this.
        _LOGGER.exception("Failed to revert diagnostic overrides during shutdown")
    scheduler = get_scheduler()
    if scheduler.running:
        scheduler.shutdown(wait=True)
    from .services.http_session import close_http_session
    await close_http_session()
    from .database.engine import dispose_engine
    await dispose_engine()


from .version import resolve_version as _resolve_version
# Resolved once at import time; reused as the FastAPI app version and in the
# /api/health and /api/ready payloads.
_APP_VERSION = _resolve_version()

# The ASGI application. ``lifespan`` drives startup and shutdown.
app = FastAPI(title="Notify Bridge", version=_APP_VERSION, lifespan=lifespan)

# --- Security headers ---


# Bounded character set for accepted inbound X-Request-Id values. Anything
# outside this is replaced with a server-generated id so a malicious header
# can't smuggle CR/LF into log lines or break grep-by-field parsing.
# ``:`` is intentionally excluded so an inbound value can't masquerade as a
# server-minted ``disp:<hex>`` / ``req:<hex>`` id and confuse operator greps.
_REQUEST_ID_MAX_LEN = 64
_REQUEST_ID_ALLOWED = set(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
)


def _normalize_request_id(raw: str | None) -> str:
    if not raw:
        return f"req:{uuid.uuid4().hex[:12]}"
    raw = raw.strip()
    if not raw or len(raw) > _REQUEST_ID_MAX_LEN:
        return f"req:{uuid.uuid4().hex[:12]}"
    if not all(c in _REQUEST_ID_ALLOWED for c in raw):
        return f"req:{uuid.uuid4().hex[:12]}"
    return raw


class RequestContextMiddleware(BaseHTTPMiddleware):
    """Give every request a ``request_id`` and echo it on the response.

    The id is taken from the inbound ``X-Request-Id`` header when that value
    passes :func:`_normalize_request_id` (so an upstream proxy with its own
    correlation system can propagate its id); otherwise a short random
    ``req:<12 hex>`` value is minted. The same id is always written to the
    response ``X-Request-Id`` header so the SPA can surface it for
    operator-friendly bug reports.

    The id is bound via :func:`bind_log_context` for the duration of the
    downstream call, so it appears on every log line emitted during request
    handling (``[req=...]``) and is picked up by
    :func:`notify_bridge_core.log_context.enrich_details_with_correlation`
    when an ``EventLog`` row is written during the same request.
    """

    async def dispatch(
        self,
        request: StarletteRequest,
        call_next: RequestResponseEndpoint,
    ) -> StarletteResponse:
        inbound_value = request.headers.get("x-request-id")
        correlation_id = _normalize_request_id(inbound_value)
        # Only the downstream call runs inside the bound log context; the
        # echo header is set after the context has been released.
        with bind_log_context(request_id=correlation_id):
            downstream: StarletteResponse = await call_next(request)
        downstream.headers["X-Request-Id"] = correlation_id
        return downstream


# Content-Security-Policy header value. SecurityHeadersMiddleware applies it
# with ``setdefault``, i.e. only to responses that do not already carry a
# Content-Security-Policy header of their own.
_CSP = (
    "default-src 'self'; "
    "img-src 'self' data: blob: https:; "
    "style-src 'self' 'unsafe-inline'; "
    # SvelteKit's static adapter emits an inline bootstrap <script> with the
    # hydration payload, so 'self' alone blocks the SPA from starting.
    # 'unsafe-inline' re-enables it; the app's primary XSS protection still
    # comes from Svelte's template auto-escaping and frontend/sanitize.ts
    # for the few {@html} paths that render user-controlled content.
    "script-src 'self' 'unsafe-inline'; "
    "connect-src 'self'; "
    "font-src 'self' data:; "
    "base-uri 'self'; "
    "form-action 'self'; "
    "frame-ancestors 'none'"
)


class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Attach security-related headers to every response.

    Four headers are always overwritten with fixed values. The
    Content-Security-Policy and Strict-Transport-Security headers are only
    defaults: a value already present on the response is left untouched.
    """

    async def dispatch(self, request: StarletteRequest, call_next):
        outgoing: StarletteResponse = await call_next(request)

        # Unconditional headers — these replace whatever the handler set.
        for header_name, header_value in (
            ("X-Content-Type-Options", "nosniff"),
            ("X-Frame-Options", "DENY"),
            ("X-XSS-Protection", "1; mode=block"),
            ("Referrer-Policy", "strict-origin-when-cross-origin"),
        ):
            outgoing.headers[header_name] = header_value

        outgoing.headers.setdefault("Content-Security-Policy", _CSP)

        # HSTS only makes sense over HTTPS; set when the edge terminates TLS
        # and forwards X-Forwarded-Proto=https.
        forwarded_https = request.headers.get("x-forwarded-proto") == "https"
        if forwarded_https:
            outgoing.headers.setdefault(
                "Strict-Transport-Security",
                "max-age=31536000; includeSubDomains",
            )
        return outgoing


app.add_middleware(SecurityHeadersMiddleware)

# --- Rate limiting ---
# The limiter instance is the one defined alongside the auth routes; it is
# published on ``app.state`` and paired with the slowapi handler that turns
# RateLimitExceeded into a response.
from .auth.routes import limiter
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
app.add_middleware(SlowAPIMiddleware)

# --- CORS ---
# Same settings object as ``_log_cfg`` above, imported under a second alias.
from .config import settings as _cfg
# ``cors_allowed_origins`` is a comma-separated string; entries are trimmed
# and blank ones dropped.
_origins = [o.strip() for o in _cfg.cors_allowed_origins.split(",") if o.strip()]
# NOTE(review): credentials are allowed together with wildcard methods and
# headers — confirm the configured origin list can never contain "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request-ID middleware is added LAST so it becomes the outermost wrapper —
# every other middleware (CORS, rate limit, security headers) then logs with
# the request_id already bound, and CORS preflight responses also carry the
# X-Request-Id echo header.
app.add_middleware(RequestContextMiddleware)

# Register routes — static paths before parameterized. The tuple below is
# walked top to bottom, so its order IS the registration order.
for _router in (
    auth_router,
    template_vars_router,
    providers_router,
    notification_trackers_router,
    notification_tracker_targets_router,
    tracking_configs_router,
    template_configs_router,
    targets_router,
    target_receivers_router,
    telegram_bots_router,
    email_bots_router,
    matrix_bots_router,
    users_router,
    status_router,
    app_settings_router,
    action_types_router,
    action_rules_router,
    actions_router,
    command_configs_router,
    command_trackers_router,
    command_template_configs_router,
    webhook_router,
    webhooks_router,
    webhook_logs_router,
    backup_router,
    metrics_router,
):
    app.include_router(_router)
# Drop the loop variable so it does not linger in the module namespace.
del _router


@app.get("/api/health")
async def health():
    """Liveness probe.

    Reports that the process is up and responding, together with the app
    version. It performs no dependency checks, so it always returns 200 once
    the ASGI app has started. Keep this endpoint anonymous and trivially
    cheap.
    """
    payload = {"status": "ok", "version": _APP_VERSION}
    return payload


@app.get("/api/ready")
async def ready():
    """Readiness: deep dependency check.

    Verifies each critical dependency is actually reachable, not just that
    the app finished its lifespan startup. Returns 503 if any *required*
    check fails (db, scheduler). Home Assistant supervisor presence is
    informational — a degraded HA does not flip readiness off.

    Every database round-trip made by the probe (the ``SELECT 1`` ping and
    the HA provider lookup) is bounded by a 2s timeout, so a hung database
    cannot stall the probe itself.

    Response shape:
        {
          "ready": bool,
          "checks": {"db": "ok|fail", "scheduler": "ok|fail", "ha": "ok|degraded|na"},
          "errors": [str, ...],
          "version": <app version>
        }

    Returns:
        The body above as a plain dict (HTTP 200) when both required checks
        pass, otherwise a ``JSONResponse`` with status 503.
    """
    from starlette.responses import JSONResponse
    import asyncio as _asyncio
    from sqlalchemy import text as _text

    # Upper bound, in seconds, for each database round-trip made by the probe.
    probe_timeout = 2.0

    def _describe(exc: BaseException) -> str:
        # The timeout raised by ``wait_for`` stringifies to "", which would
        # leave a bare "db: " entry; fall back to the exception class name.
        return str(exc) or type(exc).__name__

    checks: dict[str, str] = {}
    errors: list[str] = []

    if not _READY:
        # Lifespan still running — short-circuit so we don't poke a half-built engine.
        return JSONResponse(
            {
                "ready": False,
                "checks": {"db": "fail", "scheduler": "fail", "ha": "na"},
                "errors": ["startup not complete"],
                "version": _APP_VERSION,
            },
            status_code=503,
        )

    # --- DB: SELECT 1 with a 2s timeout ---
    try:
        from .database.engine import get_engine
        engine = get_engine()

        async def _ping_db() -> None:
            async with engine.connect() as conn:
                await conn.execute(_text("SELECT 1"))

        await _asyncio.wait_for(_ping_db(), timeout=probe_timeout)
        checks["db"] = "ok"
    except Exception as exc:  # noqa: BLE001
        checks["db"] = "fail"
        errors.append(f"db: {_describe(exc)}")

    # --- Scheduler: APScheduler must be running ---
    try:
        from .services.scheduler import get_scheduler
        scheduler = get_scheduler()
        if scheduler.running:
            checks["scheduler"] = "ok"
        else:
            checks["scheduler"] = "fail"
            errors.append("scheduler: not running")
    except Exception as exc:  # noqa: BLE001
        checks["scheduler"] = "fail"
        errors.append(f"scheduler: {_describe(exc)}")

    # --- HA supervisor: informational only ---
    # If no HA providers are configured, report "na" (not applicable). If any
    # HA providers exist, ensure at least one supervisor task is alive — a
    # task being not-yet-connected is fine, we just want it to exist.
    try:
        from sqlmodel import select as _select
        from sqlmodel.ext.asyncio.session import AsyncSession as _AS
        from .database.models import ServiceProvider
        from .services.ha_subscription import _running_tasks as _ha_tasks

        from .database.engine import get_engine as _get_engine_ha

        async def _list_ha_providers():
            async with _AS(_get_engine_ha()) as _session:
                _result = await _session.exec(
                    _select(ServiceProvider).where(
                        ServiceProvider.type == "home_assistant",
                    )
                )
                return _result.all()

        # Bounded like the DB ping: without the timeout a hung database would
        # block here indefinitely even after the ping above had timed out.
        ha_providers = await _asyncio.wait_for(
            _list_ha_providers(), timeout=probe_timeout
        )
        if not ha_providers:
            checks["ha"] = "na"
        else:
            alive = [
                t for t in _ha_tasks.values() if t is not None and not t.done()
            ]
            checks["ha"] = "ok" if alive else "degraded"
    except Exception as exc:  # noqa: BLE001
        # Never let the HA probe fail readiness — it's informational.
        checks["ha"] = "degraded"
        errors.append(f"ha: {_describe(exc)}")

    required_ok = checks["db"] == "ok" and checks["scheduler"] == "ok"
    body = {
        "ready": required_ok,
        "checks": checks,
        "errors": errors,
        "version": _APP_VERSION,
    }
    if not required_ok:
        return JSONResponse(body, status_code=503)
    return body


# --- Serve frontend static files (production) ---
# Must come AFTER all API routes so /api/* takes priority
from pathlib import Path
# The frontend is only mounted when ``static_dir`` is configured and exists
# on disk; otherwise the app serves the API alone.
if _cfg.static_dir and Path(_cfg.static_dir).is_dir():
    from fastapi.staticfiles import StaticFiles
    from starlette.responses import FileResponse
    from starlette.exceptions import HTTPException as StarletteHTTPException

    _static_dir = Path(_cfg.static_dir)

    class SPAStaticFiles(StaticFiles):
        """StaticFiles that falls back to index.html for SvelteKit client-side routes.

        Any 404 whose path does not start with ``api/`` is answered with
        index.html, so that deep links like /settings hydrate the SPA. 404s
        under ``api/`` and every non-404 error are re-raised unchanged.

        NOTE(review): a missing static asset (e.g. a stale bundle URL) also
        receives index.html rather than a 404 — confirm that is intended.
        """

        async def get_response(self, path: str, scope):
            try:
                return await super().get_response(path, scope)
            except StarletteHTTPException as exc:
                # ``path`` is compared without a leading slash ("api/...").
                if exc.status_code == 404 and not path.startswith("api/"):
                    return FileResponse(_static_dir / "index.html")
                raise

    # Mounted at the root, after the API routes have been registered.
    app.mount("/", SPAStaticFiles(directory=_cfg.static_dir, html=True), name="frontend")


def run():
    """Serve ``app`` with uvicorn using the host and port from settings.

    Proxy headers are honoured, with the trusted proxy list taken from
    ``forwarded_allow_ips`` and defaulting to ``127.0.0.1`` when that setting
    is empty. The uvicorn access log is switched off when ``debug`` is set.
    """
    import uvicorn

    server_options = {
        "host": _cfg.host,
        "port": _cfg.port,
        "proxy_headers": True,
        "forwarded_allow_ips": _cfg.forwarded_allow_ips or "127.0.0.1",
        "timeout_graceful_shutdown": _cfg.graceful_shutdown_seconds,
        "access_log": not _cfg.debug,
    }
    uvicorn.run(app, **server_options)