feat(logging): production-grade logging with context vars, secret masking, and runtime level control

Boot-time logging was a three-line basicConfig stub with no timestamps and no
correlation, and every layer of the Telegram send path dropped failures
silently — a /random command that delivered text but no media left zero
evidence in the log. This replaces the setup and closes every silent drop
encountered end-to-end.

New infrastructure:
- notify_bridge_core.log_context: request_id/command/chat_id/bot_id/dispatch_id
  ContextVars with a bind_log_context() context manager so deep call sites
  (TelegramClient, NotificationDispatcher) inherit the correlation tag without
  threading args through.
- notify_bridge_server.logging_setup: dictConfig-based setup with a
  LogRecordFactory that tags every record, a SecretMaskingFilter that redacts
  /botN:TOKEN plus Authorization/x-api-key/password/secret in messages AND
  tracebacks, a JSON formatter for aggregators, a text formatter with a
  grep-friendly [req=... cmd=... bot=... chat=... disp=...] prefix, and
  default dampening for sqlalchemy/aiohttp/apscheduler/urllib3/PIL.

Runtime control:
- NOTIFY_BRIDGE_LOG_LEVEL / _FORMAT / _LEVELS env vars (boot).
- DB-backed log_level / log_format / log_levels AppSettings, applied on
  boot after migrations and live via apply_log_levels() when edited in
  the settings UI (format still requires restart, logs a WARN).
- Frontend settings page gains a Logging card (level dropdown, format
  dropdown, per-module overrides); en/ru i18n keys added.

Call-site fixes (/random media-group blind spot and adjacent):
- TelegramClient._fetch_asset: every silent drop now WARN-logs with reason
  (missing url, HTTP non-200, size/dimension limits, ClientError).
- TelegramClient._send_media_group: WARN on "chunk had N items but 0 usable",
  ERROR on sendMediaGroup non-ok/transport with full context; returns
  success=False + "no_items_delivered" instead of success=True with an empty
  message_ids list so callers can distinguish.
- TelegramClient.send_message / _upload_media / _send_from_cache: ERROR on
  non-ok + transport failures with status/code/desc; DEBUG for cache-hit
  fallbacks.
- NotificationDispatcher.dispatch: generates a dispatch_id, binds it, logs
  start/finish with failure count, uses exc_info for target failures.
- commands/handler: missing/failed templates -> ERROR + exc_info; send_reply
  and send_media_group errors upgraded WARNING -> ERROR with chat/error_code
  context; rate-limit and truncation cases logged with full context.
- commands/webhook and services/telegram_poller:
  bind_log_context(request_id=tg:<update_id>, command, chat_id, bot_id), INFO
  on receive/dispatch/completion with duration, exc_info on raise, INFO when
  commands are disabled.
- commands/immich: INFO when album scope is empty; WARN per asset dropped
  from media payload and a summary WARN when "N assets in, 0 out".
This commit is contained in:
2026-04-23 14:41:26 +03:00
parent 1f880daa0c
commit f50d465c0e
15 changed files with 831 additions and 63 deletions
@@ -0,0 +1,66 @@
"""Request-scoped ContextVars that propagate into log records.
The server sets these at entry points (Telegram webhook, scheduler dispatch,
REST call) and they propagate through async calls automatically. A
``LogRecordFactory`` installed by ``notify_bridge_server.logging_setup``
reads them so every log line is tagged (``request_id``, ``command``,
``chat_id``, ``bot_id``, ``dispatch_id``) without each call site having
to pass the values explicitly.
Kept in ``notify_bridge_core`` so core modules (``TelegramClient``,
``NotificationDispatcher``) can *set* additional context (e.g. a
``dispatch_id``) without depending on the server package.
"""
from __future__ import annotations
from contextlib import contextmanager
from contextvars import ContextVar, Token
from typing import Any, Iterator
request_id_var: ContextVar[str | None] = ContextVar("request_id", default=None)
command_var: ContextVar[str | None] = ContextVar("command", default=None)
chat_id_var: ContextVar[str | None] = ContextVar("chat_id", default=None)
bot_id_var: ContextVar[int | None] = ContextVar("bot_id", default=None)
dispatch_id_var: ContextVar[str | None] = ContextVar("dispatch_id", default=None)
_VAR_MAP: dict[str, ContextVar[Any]] = {
"request_id": request_id_var,
"command": command_var,
"chat_id": chat_id_var,
"bot_id": bot_id_var,
"dispatch_id": dispatch_id_var,
}
@contextmanager
def bind_log_context(**kwargs: Any) -> Iterator[None]:
    """Temporarily bind log-context fields for the body of a ``with`` block.

    Each keyword whose name is a key of ``_VAR_MAP`` is set on the matching
    ContextVar; keywords with any other name are skipped without error.
    Every variable that was set is reset to its previous value when the
    block exits, whether it finishes normally or raises.

    Example:
        ``with bind_log_context(request_id="abc", command="random"): ...``
    """
    # (variable, token) pairs for every field actually bound, so that the
    # cleanup below only touches what this call changed.
    bound: list[tuple[ContextVar[Any], Token]] = []
    try:
        for field_name, field_value in kwargs.items():
            if field_name not in _VAR_MAP:
                # Unknown field: ignore it rather than raise.
                continue
            ctx_var = _VAR_MAP[field_name]
            bound.append((ctx_var, ctx_var.set(field_value)))
        yield
    finally:
        # Runs on normal exit and on exceptions alike.
        for ctx_var, token in bound:
            ctx_var.reset(token)
def current_log_context() -> dict[str, Any]:
    """Return a snapshot of the context fields that are currently bound.

    Walks ``_VAR_MAP`` in order and keeps only the fields whose current
    value is not ``None``; the result is a new dict on every call.
    """
    return {
        field_name: field_value
        for field_name, ctx_var in _VAR_MAP.items()
        if (field_value := ctx_var.get()) is not None
    }
@@ -4,11 +4,13 @@ from __future__ import annotations
import asyncio
import logging
import uuid
from dataclasses import dataclass, field
from typing import Any
import aiohttp
from notify_bridge_core.log_context import bind_log_context, dispatch_id_var
from notify_bridge_core.models.events import ServiceEvent
from notify_bridge_core.templates.context import build_template_context
from notify_bridge_core.templates.renderer import render_template
@@ -95,18 +97,40 @@ class NotificationDispatcher:
Returns list of results (one per target).
"""
raw_results = await asyncio.gather(
*[self._send_to_target(event, t) for t in targets],
return_exceptions=True,
)
results = []
for raw in raw_results:
if isinstance(raw, Exception):
_LOGGER.error("Failed to dispatch to target: %s", raw)
results.append({"success": False, "error": str(raw)})
else:
results.append(raw)
return results
# Bind a dispatch_id so every log line emitted by the target sends
# (including deep in TelegramClient) can be correlated to the same
# upstream event.
new_id = dispatch_id_var.get() or f"disp:{uuid.uuid4().hex[:12]}"
with bind_log_context(dispatch_id=new_id):
_LOGGER.info(
"Dispatching event %s (collection=%r) to %d target(s)",
event.event_type.value if hasattr(event.event_type, "value") else event.event_type,
getattr(event, "collection_name", None), len(targets),
)
raw_results = await asyncio.gather(
*[self._send_to_target(event, t) for t in targets],
return_exceptions=True,
)
results = []
failures = 0
for target, raw in zip(targets, raw_results):
if isinstance(raw, Exception):
failures += 1
_LOGGER.error(
"Dispatch to target type=%s failed: %s",
target.type, raw, exc_info=raw,
)
results.append({"success": False, "error": str(raw)})
else:
if isinstance(raw, dict) and not raw.get("success"):
failures += 1
results.append(raw)
_LOGGER.info(
"Dispatch finished: %d target(s), %d failure(s)",
len(targets), failures,
)
return results
def _resolve_template(
self, event: ServiceEvent, target: TargetConfig, locale: str,
@@ -162,8 +162,20 @@ class TelegramClient:
"message_id": result.get("result", {}).get("message_id"),
"cached": True,
}
except aiohttp.ClientError:
pass
# Non-ok from a cached send — file_id stale or file deleted on
# Telegram's side. Log at DEBUG so operators who are hunting
# "why didn't the cached send work?" can see it, but the
# caller will fall through to a fresh upload.
_LOGGER.debug(
"Telegram %s (cached) returned non-ok: status=%s code=%s desc=%r — falling back to fresh upload",
kind.api_method, response.status, result.get("error_code"),
result.get("description"),
)
except aiohttp.ClientError as err:
_LOGGER.debug(
"Telegram %s (cached) transport error — falling back to fresh upload: %s",
kind.api_method, err,
)
return None
async def _upload_media(
@@ -203,8 +215,17 @@ class TelegramClient:
thumbhash=thumbhash, size=len(data),
)
return {"success": True, "message_id": res.get("message_id")}
_LOGGER.error(
"Telegram %s failed: status=%s code=%s desc=%r bytes=%d",
kind.api_method, response.status, result.get("error_code"),
result.get("description", "Unknown"), len(data),
)
return {"success": False, "error": result.get("description", "Unknown Telegram error")}
except aiohttp.ClientError as err:
_LOGGER.error(
"Telegram %s transport error (bytes=%d): %s",
kind.api_method, len(data), err, exc_info=True,
)
return {"success": False, "error": str(err)}
async def send_notification(
@@ -327,8 +348,14 @@ class TelegramClient:
retry_result = await retry_resp.json()
if retry_resp.status == 200 and retry_result.get("ok"):
return {"success": True, "message_id": retry_result.get("result", {}).get("message_id")}
_LOGGER.error(
"Telegram sendMessage failed: status=%s code=%s desc=%r",
response.status, result.get("error_code"),
result.get("description", "Unknown"),
)
return {"success": False, "error": result.get("description", "Unknown Telegram error"), "error_code": result.get("error_code")}
except aiohttp.ClientError as err:
_LOGGER.error("Telegram sendMessage transport error: %s", err, exc_info=True)
return {"success": False, "error": str(err)}
async def send_chat_action(self, chat_id: str, action: str = "typing") -> bool:
@@ -513,11 +540,14 @@ class TelegramClient:
# Tuple is (cache_key, media_type, thumbhash, uploaded_size).
media_cache_info: list[tuple[str, str, str | None, int] | None] = []
# Resolve cache hits and collect download tasks in parallel
# Resolve cache hits and collect download tasks in parallel.
# Each drop site logs the reason — otherwise a filtered asset
# disappears silently and the media group silently shrinks.
async def _fetch_asset(idx: int, item: dict) -> tuple[int, dict | None, bytes | None]:
"""Return (index, cache_entry_or_None, downloaded_bytes_or_None)."""
url = item.get("url")
if not url:
_LOGGER.warning("Media skipped: missing url (idx=%d type=%s)", idx, item.get("type"))
return idx, None, None
media_type = item.get("type", "photo")
custom_cache_key = item.get("cache_key")
@@ -537,12 +567,24 @@ class TelegramClient:
if preloaded is not None:
data = preloaded
if max_asset_data_size and len(data) > max_asset_data_size:
_LOGGER.warning(
"Media skipped: preloaded size %d exceeds max_asset_data_size %d (idx=%d type=%s url=%s)",
len(data), max_asset_data_size, idx, media_type, url,
)
return idx, None, None
if media_type == "video" and len(data) > TELEGRAM_MAX_VIDEO_SIZE:
_LOGGER.warning(
"Media skipped: preloaded video %d bytes exceeds Telegram limit %d (idx=%d url=%s)",
len(data), TELEGRAM_MAX_VIDEO_SIZE, idx, url,
)
return idx, None, None
if media_type == "photo":
exceeds, _, _, _ = check_photo_limits(data)
exceeds, reason, _, _ = check_photo_limits(data)
if exceeds:
_LOGGER.warning(
"Media skipped: preloaded photo %s (idx=%d url=%s)",
reason, idx, url,
)
return idx, None, None
return idx, None, data
@@ -551,18 +593,38 @@ class TelegramClient:
dl_headers = item.get("headers") or {}
async with self._session.get(download_url, headers=dl_headers) as resp:
if resp.status != 200:
_LOGGER.warning(
"Media skipped: download HTTP %d (idx=%d type=%s url=%s)",
resp.status, idx, media_type, url,
)
return idx, None, None
data = await resp.read()
if max_asset_data_size and len(data) > max_asset_data_size:
_LOGGER.warning(
"Media skipped: downloaded size %d exceeds max_asset_data_size %d (idx=%d type=%s url=%s)",
len(data), max_asset_data_size, idx, media_type, url,
)
return idx, None, None
if media_type == "video" and len(data) > TELEGRAM_MAX_VIDEO_SIZE:
_LOGGER.warning(
"Media skipped: video %d bytes exceeds Telegram %d-byte limit (idx=%d url=%s)",
len(data), TELEGRAM_MAX_VIDEO_SIZE, idx, url,
)
return idx, None, None
if media_type == "photo":
exceeds, _, _, _ = check_photo_limits(data)
exceeds, reason, _, _ = check_photo_limits(data)
if exceeds:
_LOGGER.warning(
"Media skipped: photo %s (idx=%d url=%s)",
reason, idx, url,
)
return idx, None, None
return idx, None, data
except aiohttp.ClientError:
except aiohttp.ClientError as err:
_LOGGER.warning(
"Media skipped: download failed (idx=%d type=%s url=%s): %s",
idx, media_type, url, err,
)
return idx, None, None
results = await asyncio.gather(
@@ -602,6 +664,14 @@ class TelegramClient:
media_json.append(mij)
if not media_json:
# Every asset in this chunk was filtered out (size, download
# failure, etc.). Without this log, sendMediaGroup returns
# success=True with zero message_ids and nobody knows why
# the user sees only the text reply and no media.
_LOGGER.warning(
"sendMediaGroup skipped — chunk %d/%d had %d input items but 0 usable (all filtered/failed)",
chunk_idx + 1, len(chunks), len(chunk),
)
continue
form.add_field("media", json.dumps(media_json))
@@ -638,10 +708,35 @@ class TelegramClient:
if eff_cache:
await eff_cache.async_set_many(cache_entries)
else:
return {"success": False, "error": result.get("description", "Unknown"), "failed_at_chunk": chunk_idx + 1}
_LOGGER.error(
"Telegram sendMediaGroup failed: status=%s code=%s desc=%r chunk=%d/%d items=%d",
response.status, result.get("error_code"),
result.get("description", "Unknown"),
chunk_idx + 1, len(chunks), len(media_json),
)
return {
"success": False,
"error": result.get("description", "Unknown"),
"error_code": result.get("error_code"),
"failed_at_chunk": chunk_idx + 1,
}
except aiohttp.ClientError as err:
_LOGGER.error(
"Telegram sendMediaGroup transport error on chunk %d/%d (%d items): %s",
chunk_idx + 1, len(chunks), len(media_json), err,
exc_info=True,
)
return {"success": False, "error": str(err), "failed_at_chunk": chunk_idx + 1}
# Distinguish "posted something" from "posted nothing" so the caller
# can surface an ERROR when a command produced a caption reply but no
# media ever reached Telegram.
if not all_message_ids:
_LOGGER.warning(
"sendMediaGroup completed with 0 message_ids across %d chunk(s) — nothing was delivered",
len(chunks),
)
return {"success": False, "error": "no_items_delivered", "chunks_sent": len(chunks)}
return {"success": True, "message_ids": all_message_ids, "chunks_sent": len(chunks)}
# ------------------------------------------------------------------