feat(logging): production-grade logging with context vars, secret masking, and runtime level control

Boot-time logging was a three-line basicConfig stub with no timestamps, no correlation, and silent drops at every layer of the Telegram send path — a /random command that delivered text but no media left zero evidence in the log. This replaces the setup and closes every silent drop encountered end-to-end. New infrastructure: - notify_bridge_core.log_context: request_id/command/chat_id/bot_id/dispatch_id ContextVars with a bind_log_context() context manager so deep call sites (TelegramClient, NotificationDispatcher) inherit the correlation tag without threading args through. - notify_bridge_server.logging_setup: dictConfig-based setup with a LogRecordFactory that tags every record, a SecretMaskingFilter that redacts /botN:TOKEN plus Authorization/x-api-key/password/secret in messages AND tracebacks, a JSON formatter for aggregators, text formatter with grep-friendly [req=... cmd=... bot=... chat=... disp=...] prefix, and default dampening for sqlalchemy/aiohttp/apscheduler/urllib3/PIL. Runtime control: - NOTIFY_BRIDGE_LOG_LEVEL / _FORMAT / _LEVELS env vars (boot). - DB-backed log_level / log_format / log_levels AppSettings, applied on boot after migrations and live via apply_log_levels() when edited in the settings UI (format still requires restart, logs a WARN). - Frontend settings page gains a Logging card (level dropdown, format dropdown, per-module overrides); en/ru i18n keys added. Call-site fixes (/random media-group blind spot and adjacent): - TelegramClient._fetch_asset: every silent drop now WARN-logs with reason (missing url, HTTP non-200, size/dimension limits, ClientError). - TelegramClient._send_media_group: WARN on "chunk had N items but 0 usable", ERROR on sendMediaGroup non-ok/transport with full context; returns success=False + "no_items_delivered" instead of success=True with an empty message_ids list so callers can distinguish. 
- TelegramClient.send_message / _upload_media / _send_from_cache: ERROR on non-ok + transport failures with status/code/desc; DEBUG for cache-hit fallbacks. - NotificationDispatcher.dispatch: generates a dispatch_id, binds it, logs start/finish with failure count, uses exc_info for target failures. - commands/handler: missing/failed templates -> ERROR + exc_info; send_reply and send_media_group errors upgraded WARNING -> ERROR with chat/error_code context; rate-limit and truncation cases logged with full context. - commands/webhook and services/telegram_poller: bind_log_context(request_id=tg:<update_id>, command, chat_id, bot_id), INFO on receive/dispatch/completion with duration, exc_info on raise, INFO when commands disabled. - commands/immich: INFO when album scope is empty; WARN per asset dropped from media payload and a summary WARN when "N assets in, 0 out".
2026-04-23 14:41:26 +03:00
parent 1f880daa0c
commit f50d465c0e
15 changed files with 831 additions and 63 deletions
@@ -0,0 +1,66 @@
+"""Request-scoped ContextVars that propagate into log records.
+
+The server sets these at entry points (Telegram webhook, scheduler dispatch,
+REST call) and they propagate through async calls automatically. A
+``LogRecordFactory`` installed by ``notify_bridge_server.logging_setup``
+reads them so every log line is tagged (``request_id``, ``command``,
+``chat_id``, ``bot_id``, ``dispatch_id``) without each call site having
+to pass the values explicitly.
+
+Kept in ``notify_bridge_core`` so core modules (``TelegramClient``,
+``NotificationDispatcher``) can *set* additional context (e.g. a
+``dispatch_id``) without depending on the server package.
+"""
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+from contextvars import ContextVar, Token
+from typing import Any, Iterator
+
+request_id_var: ContextVar[str | None] = ContextVar("request_id", default=None)
+command_var: ContextVar[str | None] = ContextVar("command", default=None)
+chat_id_var: ContextVar[str | None] = ContextVar("chat_id", default=None)
+bot_id_var: ContextVar[int | None] = ContextVar("bot_id", default=None)
+dispatch_id_var: ContextVar[str | None] = ContextVar("dispatch_id", default=None)
+
+_VAR_MAP: dict[str, ContextVar[Any]] = {  # field name -> variable; all default to None
+    "request_id": request_id_var,
+    "command": command_var,
+    "chat_id": chat_id_var,
+    "bot_id": bot_id_var,
+    "dispatch_id": dispatch_id_var,
+}
+
+
+@contextmanager
+def bind_log_context(**kwargs: Any) -> Iterator[None]:
+    """Bind the given context fields for the duration of the ``with`` block.
+
+    Unknown keys are ignored so callers can pass whatever they want without
+    an ``if`` ladder. Values are reset on exit even if the block raises.
+
+    Example:
+        ``with bind_log_context(request_id="abc", command="random"): ...``
+    """
+    tokens: list[tuple[ContextVar[Any], Token]] = []
+    try:
+        for key, value in kwargs.items():
+            var = _VAR_MAP.get(key)
+            if var is None:
+                continue
+            tokens.append((var, var.set(value)))
+        yield
+    finally:
+        for var, tok in tokens:
+            var.reset(tok)
+
+
+def current_log_context() -> dict[str, Any]:
+    """Return a snapshot of the currently-bound context values (non-None)."""
+    snap: dict[str, Any] = {}
+    for key, var in _VAR_MAP.items():
+        val = var.get()
+        if val is not None:
+            snap[key] = val
+    return snap
@@ -4,11 +4,13 @@ from __future__ import annotations

 import asyncio
 import logging
+import uuid
 from dataclasses import dataclass, field
 from typing import Any

 import aiohttp

+from notify_bridge_core.log_context import bind_log_context, dispatch_id_var
 from notify_bridge_core.models.events import ServiceEvent
 from notify_bridge_core.templates.context import build_template_context
 from notify_bridge_core.templates.renderer import render_template
@@ -95,18 +97,40 @@ class NotificationDispatcher:

        Returns list of results (one per target).
        """
-        raw_results = await asyncio.gather(
-            *[self._send_to_target(event, t) for t in targets],
-            return_exceptions=True,
-        )
-        results = []
-        for raw in raw_results:
-            if isinstance(raw, Exception):
-                _LOGGER.error("Failed to dispatch to target: %s", raw)
-                results.append({"success": False, "error": str(raw)})
-            else:
-                results.append(raw)
-        return results
+        # Bind a dispatch_id so every log line emitted by the target sends
+        # (including deep in TelegramClient) can be correlated to the same
+        # upstream event.
+        new_id = dispatch_id_var.get() or f"disp:{uuid.uuid4().hex[:12]}"
+
+        with bind_log_context(dispatch_id=new_id):
+            _LOGGER.info(
+                "Dispatching event %s (collection=%r) to %d target(s)",
+                event.event_type.value if hasattr(event.event_type, "value") else event.event_type,
+                getattr(event, "collection_name", None), len(targets),
+            )
+            raw_results = await asyncio.gather(
+                *[self._send_to_target(event, t) for t in targets],
+                return_exceptions=True,
+            )
+            results = []
+            failures = 0
+            for target, raw in zip(targets, raw_results):
+                if isinstance(raw, Exception):
+                    failures += 1
+                    _LOGGER.error(
+                        "Dispatch to target type=%s failed: %s",
+                        target.type, raw, exc_info=raw,
+                    )
+                    results.append({"success": False, "error": str(raw)})
+                else:
+                    if isinstance(raw, dict) and not raw.get("success"):
+                        failures += 1
+                    results.append(raw)
+            _LOGGER.info(
+                "Dispatch finished: %d target(s), %d failure(s)",
+                len(targets), failures,
+            )
+            return results

    def _resolve_template(
        self, event: ServiceEvent, target: TargetConfig, locale: str,
@@ -162,8 +162,20 @@ class TelegramClient:
                        "message_id": result.get("result", {}).get("message_id"),
                        "cached": True,
                    }
-        except aiohttp.ClientError:
-            pass
+                # Non-ok from a cached send — file_id stale or file deleted on
+                # Telegram's side. Log at DEBUG so operators who are hunting
+                # "why didn't the cached send work?" can see it, but the
+                # caller will fall through to a fresh upload.
+                _LOGGER.debug(
+                    "Telegram %s (cached) returned non-ok: status=%s code=%s desc=%r — falling back to fresh upload",
+                    kind.api_method, response.status, result.get("error_code"),
+                    result.get("description"),
+                )
+        except aiohttp.ClientError as err:
+            _LOGGER.debug(
+                "Telegram %s (cached) transport error — falling back to fresh upload: %s",
+                kind.api_method, err,
+            )
        return None

    async def _upload_media(
@@ -203,8 +215,17 @@ class TelegramClient:
                            thumbhash=thumbhash, size=len(data),
                        )
                    return {"success": True, "message_id": res.get("message_id")}
+                _LOGGER.error(
+                    "Telegram %s failed: status=%s code=%s desc=%r bytes=%d",
+                    kind.api_method, response.status, result.get("error_code"),
+                    result.get("description", "Unknown"), len(data),
+                )
                return {"success": False, "error": result.get("description", "Unknown Telegram error")}
        except aiohttp.ClientError as err:
+            _LOGGER.error(
+                "Telegram %s transport error (bytes=%d): %s",
+                kind.api_method, len(data), err, exc_info=True,
+            )
            return {"success": False, "error": str(err)}

    async def send_notification(
@@ -327,8 +348,14 @@ class TelegramClient:
                        retry_result = await retry_resp.json()
                        if retry_resp.status == 200 and retry_result.get("ok"):
                            return {"success": True, "message_id": retry_result.get("result", {}).get("message_id")}
+                _LOGGER.error(
+                    "Telegram sendMessage failed: status=%s code=%s desc=%r",
+                    response.status, result.get("error_code"),
+                    result.get("description", "Unknown"),
+                )
                return {"success": False, "error": result.get("description", "Unknown Telegram error"), "error_code": result.get("error_code")}
        except aiohttp.ClientError as err:
+            _LOGGER.error("Telegram sendMessage transport error: %s", err, exc_info=True)
            return {"success": False, "error": str(err)}

    async def send_chat_action(self, chat_id: str, action: str = "typing") -> bool:
@@ -513,11 +540,14 @@ class TelegramClient:
            # Tuple is (cache_key, media_type, thumbhash, uploaded_size).
            media_cache_info: list[tuple[str, str, str | None, int] | None] = []

-            # Resolve cache hits and collect download tasks in parallel
+            # Resolve cache hits and collect download tasks in parallel.
+            # Each drop site logs the reason — otherwise a filtered asset
+            # disappears silently and the media group silently shrinks.
            async def _fetch_asset(idx: int, item: dict) -> tuple[int, dict | None, bytes | None]:
                """Return (index, cache_entry_or_None, downloaded_bytes_or_None)."""
                url = item.get("url")
                if not url:
+                    _LOGGER.warning("Media skipped: missing url (idx=%d type=%s)", idx, item.get("type"))
                    return idx, None, None
                media_type = item.get("type", "photo")
                custom_cache_key = item.get("cache_key")
@@ -537,12 +567,24 @@ class TelegramClient:
                if preloaded is not None:
                    data = preloaded
                    if max_asset_data_size and len(data) > max_asset_data_size:
+                        _LOGGER.warning(
+                            "Media skipped: preloaded size %d exceeds max_asset_data_size %d (idx=%d type=%s url=%s)",
+                            len(data), max_asset_data_size, idx, media_type, url,
+                        )
                        return idx, None, None
                    if media_type == "video" and len(data) > TELEGRAM_MAX_VIDEO_SIZE:
+                        _LOGGER.warning(
+                            "Media skipped: preloaded video %d bytes exceeds Telegram limit %d (idx=%d url=%s)",
+                            len(data), TELEGRAM_MAX_VIDEO_SIZE, idx, url,
+                        )
                        return idx, None, None
                    if media_type == "photo":
-                        exceeds, _, _, _ = check_photo_limits(data)
+                        exceeds, reason, _, _ = check_photo_limits(data)
                        if exceeds:
+                            _LOGGER.warning(
+                                "Media skipped: preloaded photo %s (idx=%d url=%s)",
+                                reason, idx, url,
+                            )
                            return idx, None, None
                    return idx, None, data

@@ -551,18 +593,38 @@ class TelegramClient:
                    dl_headers = item.get("headers") or {}
                    async with self._session.get(download_url, headers=dl_headers) as resp:
                        if resp.status != 200:
+                            _LOGGER.warning(
+                                "Media skipped: download HTTP %d (idx=%d type=%s url=%s)",
+                                resp.status, idx, media_type, url,
+                            )
                            return idx, None, None
                        data = await resp.read()
                    if max_asset_data_size and len(data) > max_asset_data_size:
+                        _LOGGER.warning(
+                            "Media skipped: downloaded size %d exceeds max_asset_data_size %d (idx=%d type=%s url=%s)",
+                            len(data), max_asset_data_size, idx, media_type, url,
+                        )
                        return idx, None, None
                    if media_type == "video" and len(data) > TELEGRAM_MAX_VIDEO_SIZE:
+                        _LOGGER.warning(
+                            "Media skipped: video %d bytes exceeds Telegram %d-byte limit (idx=%d url=%s)",
+                            len(data), TELEGRAM_MAX_VIDEO_SIZE, idx, url,
+                        )
                        return idx, None, None
                    if media_type == "photo":
-                        exceeds, _, _, _ = check_photo_limits(data)
+                        exceeds, reason, _, _ = check_photo_limits(data)
                        if exceeds:
+                            _LOGGER.warning(
+                                "Media skipped: photo %s (idx=%d url=%s)",
+                                reason, idx, url,
+                            )
                            return idx, None, None
                    return idx, None, data
-                except aiohttp.ClientError:
+                except aiohttp.ClientError as err:
+                    _LOGGER.warning(
+                        "Media skipped: download failed (idx=%d type=%s url=%s): %s",
+                        idx, media_type, url, err,
+                    )
                    return idx, None, None

            results = await asyncio.gather(
@@ -602,6 +664,14 @@ class TelegramClient:
                media_json.append(mij)

            if not media_json:
+                # Every asset in this chunk was filtered out (size, download
+                # failure, etc.). Without this log, sendMediaGroup returns
+                # success=True with zero message_ids and nobody knows why
+                # the user sees only the text reply and no media.
+                _LOGGER.warning(
+                    "sendMediaGroup skipped — chunk %d/%d had %d input items but 0 usable (all filtered/failed)",
+                    chunk_idx + 1, len(chunks), len(chunk),
+                )
                continue

            form.add_field("media", json.dumps(media_json))
@@ -638,10 +708,35 @@ class TelegramClient:
                            if eff_cache:
                                await eff_cache.async_set_many(cache_entries)
                    else:
-                        return {"success": False, "error": result.get("description", "Unknown"), "failed_at_chunk": chunk_idx + 1}
+                        _LOGGER.error(
+                            "Telegram sendMediaGroup failed: status=%s code=%s desc=%r chunk=%d/%d items=%d",
+                            response.status, result.get("error_code"),
+                            result.get("description", "Unknown"),
+                            chunk_idx + 1, len(chunks), len(media_json),
+                        )
+                        return {
+                            "success": False,
+                            "error": result.get("description", "Unknown"),
+                            "error_code": result.get("error_code"),
+                            "failed_at_chunk": chunk_idx + 1,
+                        }
            except aiohttp.ClientError as err:
+                _LOGGER.error(
+                    "Telegram sendMediaGroup transport error on chunk %d/%d (%d items): %s",
+                    chunk_idx + 1, len(chunks), len(media_json), err,
+                    exc_info=True,
+                )
                return {"success": False, "error": str(err), "failed_at_chunk": chunk_idx + 1}

+        # Distinguish "posted something" from "posted nothing" so the caller
+        # can surface an ERROR when a command produced a caption reply but no
+        # media ever reached Telegram.
+        if not all_message_ids:
+            _LOGGER.warning(
+                "sendMediaGroup completed with 0 message_ids across %d chunk(s) — nothing was delivered",
+                len(chunks),
+            )
+            return {"success": False, "error": "no_items_delivered", "chunks_sent": len(chunks)}
        return {"success": True, "message_ids": all_message_ids, "chunks_sent": len(chunks)}

    # ------------------------------------------------------------------