feat: production-readiness hardening across security, async, DB, ops
Build and Test / test-frontend (push) Successful in 9m37s
Build and Test / test-backend (push) Successful in 10m53s
Build and Test / build-image (push) Failing after 14m52s

Security
- SSRF: async DNS resolver; allow_redirects=False on all outbound clients;
  matrix homeserver_url validated on create/update/test; update_provider
  and email_bot merge incoming config and reject ***-masked secrets.
- Auth: bcrypt offloaded to asyncio.to_thread; JWT now carries iss/aud +
  leeway and rejects missing claims; setup TOCTOU closed inside a
  transaction; rate limits extended (default 600/min, 10/min on password
  change, 30/min on needs-setup); constant-time login to prevent username
  enumeration.
- Config: rejects known dev secret keys; validates CORS origin schemes,
  port range, token lifetimes.
- Webhook handlers stream-read body with a 1 MiB cap; Discord 429 retries
  bounded (3 attempts, Retry-After capped at 60 s).
- CSP + HSTS added to SecurityHeadersMiddleware.

Async / runtime
- SQLite engine: WAL, synchronous=NORMAL, foreign_keys=ON, busy_timeout,
  pool_pre_ping, dispose on shutdown.
- Lifespan shutdown now stops scheduler before closing HTTP session and
  disposing the engine.
- Shared aiohttp session locked against concurrent first-caller races;
  core NotificationDispatcher accepts and reuses it.
- Storage and scheduled backup writes wrapped in asyncio.to_thread.
- NUT client writes bounded by asyncio.wait_for.
- Telegram poller switched from 3 s short-poll to 30 s interval + 25 s
  long-poll (~10x fewer API calls).

Database
- New performance-indexes migration covers every FK/owner column and
  hot-path composite (notification_tracker(provider_id, enabled);
  event_log(user_id, created_at DESC); webhook_payload_log(provider_id,
  created_at DESC); action_execution(action_id, started_at DESC)).
- New schema_version table for future upgrade gating.
- __system__ placeholder user (id=0) seeded so user_id=0 system defaults
  satisfy the newly enforced FK; filtered out of /auth/needs-setup,
  /api/users, and setup.
- list_notification_trackers rewritten to batched loads (was 1+N+N*M).
- Retention job extended to event_log, webhook_payload_log, and
  action_execution; retention days exposed as a setting.

Scheduler
- AsyncIOScheduler job_defaults: coalesce, misfire_grace_time=300,
  max_instances=1.

Ops
- uvicorn runs with proxy_headers, forwarded_allow_ips,
  timeout_graceful_shutdown; access log suppressed in non-debug.
- FastAPI version string now reads from importlib.metadata.
- New /api/ready endpoint separate from /api/health.
- docker-compose drops the ALLOW_PRIVATE_URLS=1 default, adds mem/cpu/pid
  limits, read_only + tmpfs, cap_drop:ALL, no-new-privileges; healthcheck
  targets /api/ready.
- CI now runs on push/PR with backend pytest, frontend svelte-check +
  build, and a non-push image build; release workflow gated on tests,
  publishes immutable sha-<commit> image tag, adds Trivy scan.

Tests
- New packages/server/tests/ with 29 passing tests: config validation,
  JWT round-trip + aud/alg=none rejection, SSRF scheme and private-range
  enforcement (sync + async), Discord bounded retry, and a lifespan-level
  /api/health + /api/ready smoke check.
- Renamed the misnamed services/test_dispatch.py to manual_dispatch.py so
  pytest never auto-collects production code.

Frontend
- /login now redirects already-authenticated users to /, shows a distinct
  'backend unreachable' banner (en/ru) when /auth/needs-setup fails.
This commit is contained in:
2026-04-23 19:44:56 +03:00
parent f50d465c0e
commit 920920bc67
44 changed files with 1426 additions and 186 deletions
@@ -52,22 +52,46 @@ class DiscordClient:
return {"success": True}
_MAX_RETRIES = 3
_MAX_RETRY_AFTER = 60.0
async def _post(self, url: str, payload: dict) -> dict[str, Any]:
try:
async with self._session.post(
url, json=payload, headers={"Content-Type": "application/json"}
) as resp:
if resp.status == 429:
retry_after = float(resp.headers.get("Retry-After", "2"))
_LOGGER.warning("Discord rate limited, retrying after %.1fs", retry_after)
await asyncio.sleep(retry_after)
return await self._post(url, payload)
if 200 <= resp.status < 300:
return {"success": True}
body = await resp.text()
return {"success": False, "error": f"HTTP {resp.status}: {body[:200]}"}
except aiohttp.ClientError as e:
return {"success": False, "error": str(e)}
"""POST with bounded 429 retry.
We cap retries at _MAX_RETRIES and the ``Retry-After`` header at
_MAX_RETRY_AFTER seconds so a hostile or misbehaving upstream cannot
pin the dispatch task indefinitely.
"""
for attempt in range(self._MAX_RETRIES + 1):
try:
async with self._session.post(
url,
json=payload,
headers={"Content-Type": "application/json"},
allow_redirects=False,
) as resp:
if resp.status == 429 and attempt < self._MAX_RETRIES:
try:
retry_after = float(resp.headers.get("Retry-After", "2"))
except (TypeError, ValueError):
retry_after = 2.0
retry_after = max(0.0, min(retry_after, self._MAX_RETRY_AFTER))
_LOGGER.warning(
"Discord rate limited, retrying after %.1fs (attempt %d/%d)",
retry_after, attempt + 1, self._MAX_RETRIES,
)
await asyncio.sleep(retry_after)
continue
if 200 <= resp.status < 300:
return {"success": True}
body = await resp.text()
return {
"success": False,
"error": f"HTTP {resp.status}: {body[:200]}",
}
except aiohttp.ClientError as e:
return {"success": False, "error": str(e)}
return {"success": False, "error": "Rate limited (retries exhausted)"}
def _split_message(text: str, limit: int) -> list[str]:
@@ -3,10 +3,11 @@
from __future__ import annotations
import asyncio
import contextlib
import logging
import uuid
from dataclasses import dataclass, field
from typing import Any
from typing import Any, AsyncIterator
import aiohttp
@@ -14,7 +15,7 @@ from notify_bridge_core.log_context import bind_log_context, dispatch_id_var
from notify_bridge_core.models.events import ServiceEvent
from notify_bridge_core.templates.context import build_template_context
from notify_bridge_core.templates.renderer import render_template
from .ssrf import UnsafeURLError, validate_outbound_url
from .ssrf import UnsafeURLError, avalidate_outbound_url
_HTTP_TIMEOUT = aiohttp.ClientTimeout(total=30)
@@ -84,9 +85,28 @@ class NotificationDispatcher:
*,
url_cache: TelegramFileCache | None = None,
asset_cache: TelegramFileCache | None = None,
session: aiohttp.ClientSession | None = None,
) -> None:
self._url_cache = url_cache
self._asset_cache = asset_cache
# Optional shared session owned by the caller; when supplied we reuse
# its connection pool instead of opening a fresh per-dispatch session
# (saves a TLS handshake per outbound call).
self._shared_session = session
@contextlib.asynccontextmanager
async def _session_ctx(self) -> AsyncIterator[aiohttp.ClientSession]:
"""Yield an aiohttp session, reusing the shared one if provided.
When a shared session was passed in ``__init__`` we yield it without
closing (the caller owns its lifetime). Otherwise we open a
short-lived session with our default timeout and close it on exit.
"""
if self._shared_session is not None and not self._shared_session.closed:
yield self._shared_session
return
async with self._session_ctx() as session:
yield session
async def dispatch(
self,
@@ -308,7 +328,7 @@ class NotificationDispatcher:
media_assets.append(asset)
results: list[dict[str, Any]] = []
async with _new_session() as session:
async with self._session_ctx() as session:
# Preload all asset bytes once so (a) TelegramClient can skip its
# own download and (b) we know exact upload sizes in time for the
# oversize warning in the rendered text.
@@ -378,13 +398,13 @@ class NotificationDispatcher:
return {"success": False, "error": "No receivers configured"}
results: list[dict[str, Any]] = []
async with _new_session() as session:
async with self._session_ctx() as session:
for receiver in target.receivers:
if not isinstance(receiver, WebhookReceiver) or not receiver.url:
results.append({"success": False, "error": "Invalid webhook receiver"})
continue
try:
validate_outbound_url(receiver.url)
await avalidate_outbound_url(receiver.url)
except UnsafeURLError as err:
results.append({"success": False, "error": f"Unsafe URL: {err}"})
continue
@@ -452,14 +472,14 @@ class NotificationDispatcher:
username = target.config.get("username")
results: list[dict[str, Any]] = []
async with _new_session() as session:
async with self._session_ctx() as session:
client = DiscordClient(session)
for receiver in target.receivers:
if not isinstance(receiver, DiscordReceiver) or not receiver.webhook_url:
results.append({"success": False, "error": "Invalid discord receiver"})
continue
try:
validate_outbound_url(receiver.webhook_url)
await avalidate_outbound_url(receiver.webhook_url)
except UnsafeURLError as err:
results.append({"success": False, "error": f"Unsafe URL: {err}"})
continue
@@ -478,14 +498,14 @@ class NotificationDispatcher:
username = target.config.get("username")
results: list[dict[str, Any]] = []
async with _new_session() as session:
async with self._session_ctx() as session:
client = SlackClient(session)
for receiver in target.receivers:
if not isinstance(receiver, SlackReceiver) or not receiver.webhook_url:
results.append({"success": False, "error": "Invalid slack receiver"})
continue
try:
validate_outbound_url(receiver.webhook_url)
await avalidate_outbound_url(receiver.webhook_url)
except UnsafeURLError as err:
results.append({"success": False, "error": f"Unsafe URL: {err}"})
continue
@@ -504,14 +524,14 @@ class NotificationDispatcher:
if not target.receivers:
return {"success": False, "error": "No receivers configured"}
try:
validate_outbound_url(server_url)
await avalidate_outbound_url(server_url)
except UnsafeURLError as err:
return {"success": False, "error": f"Unsafe ntfy server_url: {err}"}
title = f"{event.event_type.value}: {event.collection_name}"
results: list[dict[str, Any]] = []
async with _new_session() as session:
async with self._session_ctx() as session:
client = NtfyClient(session)
for receiver in target.receivers:
if not isinstance(receiver, NtfyReceiver) or not receiver.topic:
@@ -535,7 +555,7 @@ class NotificationDispatcher:
if not homeserver or not access_token:
return {"success": False, "error": "Missing Matrix homeserver_url or access_token"}
try:
validate_outbound_url(homeserver)
await avalidate_outbound_url(homeserver)
except UnsafeURLError as err:
return {"success": False, "error": f"Unsafe matrix homeserver_url: {err}"}
@@ -543,7 +563,7 @@ class NotificationDispatcher:
return {"success": False, "error": "No receivers configured"}
results: list[dict[str, Any]] = []
async with _new_session() as session:
async with self._session_ctx() as session:
client = MatrixClient(session, homeserver, access_token)
for receiver in target.receivers:
if not isinstance(receiver, MatrixReceiver) or not receiver.room_id:
@@ -68,7 +68,9 @@ class MatrixClient:
}
try:
async with self._session.put(url, json=body, headers=headers) as resp:
async with self._session.put(
url, json=body, headers=headers, allow_redirects=False,
) as resp:
if 200 <= resp.status < 300:
return {"success": True}
resp_body = await resp.text()
@@ -51,7 +51,9 @@ class NtfyClient:
headers["Authorization"] = f"Bearer {auth_token}"
try:
async with self._session.post(url, json=payload, headers=headers) as resp:
async with self._session.post(
url, json=payload, headers=headers, allow_redirects=False,
) as resp:
if 200 <= resp.status < 300:
return {"success": True}
body = await resp.text()
@@ -38,6 +38,7 @@ class SlackClient:
webhook_url,
json=payload,
headers={"Content-Type": "application/json"},
allow_redirects=False,
) as resp:
if resp.status == 429:
_LOGGER.warning("Slack rate limited")
@@ -12,14 +12,25 @@ development against localhost services.
from __future__ import annotations
import asyncio
import ipaddress
import logging
import os
import socket
from urllib.parse import urlparse
_LOGGER = logging.getLogger(__name__)
_ALLOW_PRIVATE = os.environ.get("NOTIFY_BRIDGE_ALLOW_PRIVATE_URLS") == "1"
_ALLOWED_SCHEMES = {"http", "https"}
if _ALLOW_PRIVATE: # pragma: no cover — operator-visible banner
_LOGGER.warning(
"SSRF guard: private-URL bypass ENABLED "
"(NOTIFY_BRIDGE_ALLOW_PRIVATE_URLS=1). Requests to RFC1918 / "
"loopback / link-local hosts will be permitted."
)
class UnsafeURLError(ValueError):
"""Raised when a URL targets a disallowed network destination."""
@@ -36,13 +47,7 @@ def _is_blocked_ip(ip: ipaddress.IPv4Address | ipaddress.IPv6Address) -> bool:
)
def validate_outbound_url(url: str) -> str:
"""Validate ``url`` is safe to fetch; returns the URL on success.
Raises :class:`UnsafeURLError` when the scheme, host, or resolved IP
is not allowed. In development (``NOTIFY_BRIDGE_ALLOW_PRIVATE_URLS=1``)
private addresses are permitted but the scheme check still applies.
"""
def _check_scheme_host(url: str) -> tuple[str, str]:
if not isinstance(url, str) or not url:
raise UnsafeURLError("URL is empty")
parsed = urlparse(url)
@@ -51,6 +56,31 @@ def validate_outbound_url(url: str) -> str:
host = parsed.hostname
if not host:
raise UnsafeURLError("URL has no host")
return parsed.scheme, host
def _check_resolved_addresses(host: str, infos: list[tuple]) -> None:
for info in infos:
sockaddr = info[4]
try:
ip = ipaddress.ip_address(sockaddr[0])
except ValueError:
continue
if _is_blocked_ip(ip):
raise UnsafeURLError(f"Host {host} resolves to blocked address {ip}")
def validate_outbound_url(url: str) -> str:
"""Validate ``url`` is safe to fetch; returns the URL on success.
Raises :class:`UnsafeURLError` when the scheme, host, or resolved IP
is not allowed. In development (``NOTIFY_BRIDGE_ALLOW_PRIVATE_URLS=1``)
private addresses are permitted but the scheme check still applies.
Synchronous; uses blocking ``socket.getaddrinfo``. Prefer
:func:`avalidate_outbound_url` from async code paths.
"""
_, host = _check_scheme_host(url)
if _ALLOW_PRIVATE:
return url
@@ -64,17 +94,37 @@ def validate_outbound_url(url: str) -> str:
except ValueError:
pass
# Hostname — resolve and reject if any resolution is in a blocked range.
try:
infos = socket.getaddrinfo(host, None)
except socket.gaierror as exc:
raise UnsafeURLError(f"DNS resolution failed for {host}") from exc
for info in infos:
sockaddr = info[4]
try:
ip = ipaddress.ip_address(sockaddr[0])
except ValueError:
continue
if _is_blocked_ip(ip):
raise UnsafeURLError(f"Host {host} resolves to blocked address {ip}")
_check_resolved_addresses(host, infos)
return url
async def avalidate_outbound_url(url: str) -> str:
"""Async variant that resolves DNS via the running loop's resolver.
Use this from ``async def`` code paths to avoid blocking the event
loop on DNS lookups.
"""
_, host = _check_scheme_host(url)
if _ALLOW_PRIVATE:
return url
try:
ip = ipaddress.ip_address(host)
if _is_blocked_ip(ip):
raise UnsafeURLError(f"Host {host} is in a blocked range")
return url
except ValueError:
pass
loop = asyncio.get_running_loop()
try:
infos = await loop.getaddrinfo(host, None)
except socket.gaierror as exc:
raise UnsafeURLError(f"DNS resolution failed for {host}") from exc
_check_resolved_addresses(host, infos)
return url
@@ -7,7 +7,7 @@ from typing import Any
import aiohttp
from ..ssrf import UnsafeURLError, validate_outbound_url
from ..ssrf import UnsafeURLError, avalidate_outbound_url
_LOGGER = logging.getLogger(__name__)
@@ -24,7 +24,7 @@ class WebhookClient:
async def send(self, payload: dict[str, Any]) -> dict[str, Any]:
try:
validate_outbound_url(self._url)
await avalidate_outbound_url(self._url)
except UnsafeURLError as err:
return {"success": False, "error": f"Unsafe URL: {err}"}
try:
@@ -33,6 +33,7 @@ class WebhookClient:
json=payload,
headers={"Content-Type": "application/json", **self._headers},
timeout=_DEFAULT_TIMEOUT,
allow_redirects=False,
) as response:
if 200 <= response.status < 300:
return {"success": True, "status_code": response.status}
@@ -12,6 +12,7 @@ _LOGGER = logging.getLogger(__name__)
_DEFAULT_PORT = 3493
_READ_TIMEOUT = 10.0
_WRITE_TIMEOUT = 10.0
_CONNECT_TIMEOUT = 5.0
# Allowed characters for NUT protocol identifiers (UPS names, variable names).
@@ -84,14 +85,26 @@ class NutClient:
await self._command(f"PASSWORD {self._password}")
async def disconnect(self) -> None:
"""Send LOGOUT and close the TCP connection."""
"""Send LOGOUT and close the TCP connection.
``drain`` is bounded by ``_WRITE_TIMEOUT`` so a half-closed peer
cannot hold the disconnect indefinitely — a tracker tick would
otherwise be pinned by a stuck NUT server and block the scheduler
slot (``max_instances=1``).
"""
if self._writer is not None:
try:
self._writer.write(b"LOGOUT\n")
await self._writer.drain()
except OSError:
await asyncio.wait_for(self._writer.drain(), timeout=_WRITE_TIMEOUT)
except (OSError, asyncio.TimeoutError):
pass
self._writer.close()
try:
await asyncio.wait_for(
self._writer.wait_closed(), timeout=_WRITE_TIMEOUT,
)
except (OSError, asyncio.TimeoutError):
pass
self._reader = None
self._writer = None
@@ -135,7 +148,10 @@ class NutClient:
if self._writer is None:
raise NutClientError("Not connected")
self._writer.write(f"{cmd}\n".encode())
await self._writer.drain()
try:
await asyncio.wait_for(self._writer.drain(), timeout=_WRITE_TIMEOUT)
except asyncio.TimeoutError as exc:
raise NutClientError("Write timeout") from exc
async def _readline(self) -> str:
"""Read one line from upsd, stripping trailing newline."""
+37 -11
View File
@@ -2,8 +2,10 @@
from __future__ import annotations
import asyncio
import json
import logging
import os
from pathlib import Path
from typing import Any, Protocol, runtime_checkable
@@ -19,34 +21,58 @@ class StorageBackend(Protocol):
async def remove(self) -> None: ...
def _read_file(path: Path) -> str | None:
if not path.exists():
return None
return path.read_text(encoding="utf-8")
def _atomic_write(path: Path, payload: str) -> None:
"""Write atomically: tmp file + rename. Prevents half-written files on crash."""
path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_suffix(path.suffix + ".tmp")
tmp.write_text(payload, encoding="utf-8")
os.replace(tmp, path)
def _remove_file(path: Path) -> None:
if path.exists():
path.unlink()
class JsonFileBackend:
"""Simple JSON file storage backend."""
"""Simple JSON file storage backend.
All blocking I/O is wrapped in ``asyncio.to_thread`` so callers can
``await load() / save() / remove()`` without stalling the event loop.
"""
def __init__(self, path: Path) -> None:
self._path = path
async def load(self) -> dict[str, Any] | None:
if not self._path.exists():
try:
text = await asyncio.to_thread(_read_file, self._path)
except OSError as err:
_LOGGER.warning("Failed to load %s: %s", self._path, err)
return None
if text is None:
return None
try:
text = self._path.read_text(encoding="utf-8")
return json.loads(text)
except (json.JSONDecodeError, OSError) as err:
_LOGGER.warning("Failed to load %s: %s", self._path, err)
except json.JSONDecodeError as err:
_LOGGER.warning("Failed to parse %s: %s", self._path, err)
return None
async def save(self, data: dict[str, Any]) -> None:
payload = json.dumps(data, default=str)
try:
self._path.parent.mkdir(parents=True, exist_ok=True)
self._path.write_text(
json.dumps(data, default=str), encoding="utf-8"
)
await asyncio.to_thread(_atomic_write, self._path, payload)
except OSError as err:
_LOGGER.error("Failed to save %s: %s", self._path, err)
async def remove(self) -> None:
try:
if self._path.exists():
self._path.unlink()
await asyncio.to_thread(_remove_file, self._path)
except OSError as err:
_LOGGER.error("Failed to remove %s: %s", self._path, err)