feat(logging): production-grade logging with context vars, secret masking, and runtime level control
Boot-time logging was a three-line basicConfig stub with no timestamps, no correlation, and silent drops at every layer of the Telegram send path — a /random command that delivered text but no media left zero evidence in the log. This replaces the setup and closes every silent drop encountered end-to-end. New infrastructure: - notify_bridge_core.log_context: request_id/command/chat_id/bot_id/dispatch_id ContextVars with a bind_log_context() context manager so deep call sites (TelegramClient, NotificationDispatcher) inherit the correlation tag without threading args through. - notify_bridge_server.logging_setup: dictConfig-based setup with a LogRecordFactory that tags every record, a SecretMaskingFilter that redacts /botN:TOKEN plus Authorization/x-api-key/password/secret in messages AND tracebacks, a JSON formatter for aggregators, text formatter with grep-friendly [req=... cmd=... bot=... chat=... disp=...] prefix, and default dampening for sqlalchemy/aiohttp/apscheduler/urllib3/PIL. Runtime control: - NOTIFY_BRIDGE_LOG_LEVEL / _FORMAT / _LEVELS env vars (boot). - DB-backed log_level / log_format / log_levels AppSettings, applied on boot after migrations and live via apply_log_levels() when edited in the settings UI (format still requires restart, logs a WARN). - Frontend settings page gains a Logging card (level dropdown, format dropdown, per-module overrides); en/ru i18n keys added. Call-site fixes (/random media-group blind spot and adjacent): - TelegramClient._fetch_asset: every silent drop now WARN-logs with reason (missing url, HTTP non-200, size/dimension limits, ClientError). - TelegramClient._send_media_group: WARN on "chunk had N items but 0 usable", ERROR on sendMediaGroup non-ok/transport with full context; returns success=False + "no_items_delivered" instead of success=True with an empty message_ids list so callers can distinguish. - TelegramClient.send_message / _upload_media / _send_from_cache: ERROR on non-ok + transport failures with status/code/desc; DEBUG for cache-hit fallbacks. - NotificationDispatcher.dispatch: generates a dispatch_id, binds it, logs start/finish with failure count, uses exc_info for target failures. - commands/handler: missing/failed templates -> ERROR + exc_info; send_reply and send_media_group errors upgraded WARNING -> ERROR with chat/error_code context; rate-limit and truncation cases logged with full context. - commands/webhook and services/telegram_poller: bind_log_context(request_id =tg:<update_id>, command, chat_id, bot_id), INFO on receive/dispatch/ completion with duration, exc_info on raise, INFO when commands disabled. - commands/immich: INFO when album scope is empty; WARN per asset dropped from media payload and a summary WARN when "N assets in, 0 out".
This commit is contained in:
@@ -108,13 +108,18 @@ def _render_cmd_template(
|
||||
"""Render a locale-aware command template. Falls back to 'en'."""
|
||||
template_str = _resolve_template(templates, slot_name, locale)
|
||||
if not template_str:
|
||||
_LOGGER.warning("No command template found for slot '%s' locale '%s'", slot_name, locale)
|
||||
# Missing template = user sees "[No template: X]" — this is an ERROR,
|
||||
# not a warning. Broken replies must stand out in production logs.
|
||||
_LOGGER.error("No command template found for slot '%s' locale '%s'", slot_name, locale)
|
||||
return f"[No template: {slot_name}]"
|
||||
try:
|
||||
tmpl = _compile_template(template_str)
|
||||
return tmpl.render(**context)
|
||||
except Exception as e:
|
||||
_LOGGER.warning("Failed to render command template '%s': %s", slot_name, e)
|
||||
except Exception:
|
||||
_LOGGER.error(
|
||||
"Failed to render command template '%s' locale=%s — user will see a broken reply",
|
||||
slot_name, locale, exc_info=True,
|
||||
)
|
||||
return f"[Template error: {slot_name}]"
|
||||
|
||||
|
||||
@@ -296,6 +301,10 @@ async def handle_command(
|
||||
# Rate limit check (once per command, shared across all trackers)
|
||||
wait = _check_rate_limit(bot.id, chat_id, cmd, rate_limits)
|
||||
if wait is not None:
|
||||
_LOGGER.info(
|
||||
"Rate-limited /%s for bot=%d chat=%s — %ds cooldown remaining",
|
||||
cmd, bot.id, chat_id, wait,
|
||||
)
|
||||
text_resp = _render_cmd_template(merged_templates, "rate_limited", locale, {"wait": wait})
|
||||
return [CommandResponse(text=text_resp)]
|
||||
|
||||
@@ -322,8 +331,8 @@ async def handle_command(
|
||||
for tracker, config, provider, listener in ctx_tuples:
|
||||
if len(responses) >= _MAX_RESPONSES_PER_COMMAND:
|
||||
_LOGGER.warning(
|
||||
"Truncated command responses at %d for bot %d cmd /%s",
|
||||
_MAX_RESPONSES_PER_COMMAND, bot.id, cmd,
|
||||
"Truncated command responses at %d for bot=%d chat=%s cmd=/%s (listener context size=%d)",
|
||||
_MAX_RESPONSES_PER_COMMAND, bot.id, chat_id, cmd, len(ctx_tuples),
|
||||
)
|
||||
break
|
||||
|
||||
@@ -418,7 +427,12 @@ async def send_reply(
|
||||
disable_web_page_preview=True,
|
||||
)
|
||||
if not result.get("success"):
|
||||
_LOGGER.warning("Telegram reply failed: %s", result.get("error"))
|
||||
# User-visible failure: the bot's reply never reached the chat.
|
||||
_LOGGER.error(
|
||||
"Telegram reply failed (chat=%s reply_to=%s len=%d): code=%s error=%r",
|
||||
chat_id, reply_to_message_id, len(text or ""),
|
||||
result.get("error_code"), result.get("error"),
|
||||
)
|
||||
|
||||
|
||||
async def send_media_group(
|
||||
@@ -442,6 +456,14 @@ async def send_media_group(
|
||||
assets hit the cache and skip the re-upload.
|
||||
"""
|
||||
if not media_items:
|
||||
# This is what happened in the /random blind spot: the text reply
|
||||
# was sent, but the media follow-up was silently skipped because
|
||||
# the caller passed an empty media list. Surface it so we can see
|
||||
# it in the log and correlate with the text message.
|
||||
_LOGGER.warning(
|
||||
"send_media_group called with 0 items (chat=%s reply_to=%s) — no media will be delivered",
|
||||
chat_id, reply_to_message_id,
|
||||
)
|
||||
return
|
||||
|
||||
from ..services.telegram_send import send_telegram_media
|
||||
@@ -452,7 +474,13 @@ async def send_media_group(
|
||||
chat_action=None,
|
||||
)
|
||||
if not result.get("success"):
|
||||
_LOGGER.warning("Telegram media group failed: %s", result.get("error"))
|
||||
# User-visible failure: media promised by the text reply never arrived.
|
||||
_LOGGER.error(
|
||||
"Telegram media group failed (chat=%s items=%d reply_to=%s): code=%s error=%r failed_at_chunk=%s",
|
||||
chat_id, len(media_items), reply_to_message_id,
|
||||
result.get("error_code"), result.get("error"),
|
||||
result.get("failed_at_chunk"),
|
||||
)
|
||||
|
||||
|
||||
async def register_commands_with_telegram(bot: TelegramBot) -> bool:
|
||||
|
||||
Reference in New Issue
Block a user