feat(logging): production-grade logging with context vars, secret masking, and runtime level control
Boot-time logging was a three-line basicConfig stub with no timestamps, no correlation, and silent drops at every layer of the Telegram send path — a /random command that delivered text but no media left zero evidence in the log. This replaces the setup and closes every silent drop encountered end-to-end. New infrastructure: - notify_bridge_core.log_context: request_id/command/chat_id/bot_id/dispatch_id ContextVars with a bind_log_context() context manager so deep call sites (TelegramClient, NotificationDispatcher) inherit the correlation tag without threading args through. - notify_bridge_server.logging_setup: dictConfig-based setup with a LogRecordFactory that tags every record, a SecretMaskingFilter that redacts /botN:TOKEN plus Authorization/x-api-key/password/secret in messages AND tracebacks, a JSON formatter for aggregators, text formatter with grep-friendly [req=... cmd=... bot=... chat=... disp=...] prefix, and default dampening for sqlalchemy/aiohttp/apscheduler/urllib3/PIL. Runtime control: - NOTIFY_BRIDGE_LOG_LEVEL / _FORMAT / _LEVELS env vars (boot). - DB-backed log_level / log_format / log_levels AppSettings, applied on boot after migrations and live via apply_log_levels() when edited in the settings UI (format still requires restart, logs a WARN). - Frontend settings page gains a Logging card (level dropdown, format dropdown, per-module overrides); en/ru i18n keys added. Call-site fixes (/random media-group blind spot and adjacent): - TelegramClient._fetch_asset: every silent drop now WARN-logs with reason (missing url, HTTP non-200, size/dimension limits, ClientError). - TelegramClient._send_media_group: WARN on "chunk had N items but 0 usable", ERROR on sendMediaGroup non-ok/transport with full context; returns success=False + "no_items_delivered" instead of success=True with an empty message_ids list so callers can distinguish. - TelegramClient.send_message / _upload_media / _send_from_cache: ERROR on non-ok + transport failures with status/code/desc; DEBUG for cache-hit fallbacks. - NotificationDispatcher.dispatch: generates a dispatch_id, binds it, logs start/finish with failure count, uses exc_info for target failures. - commands/handler: missing/failed templates -> ERROR + exc_info; send_reply and send_media_group errors upgraded WARNING -> ERROR with chat/error_code context; rate-limit and truncation cases logged with full context. - commands/webhook and services/telegram_poller: bind_log_context(request_id =tg:<update_id>, command, chat_id, bot_id), INFO on receive/dispatch/ completion with duration, exc_info on raise, INFO when commands disabled. - commands/immich: INFO when album scope is empty; WARN per asset dropped from media payload and a summary WARN when "N assets in, 0 out".
This commit is contained in:
@@ -24,6 +24,10 @@ _SETTING_KEYS = {
|
||||
"telegram_asset_cache_max_entries": None, # LRU cap for both caches
|
||||
"supported_locales": None, # comma-separated locale codes
|
||||
"timezone": "NOTIFY_BRIDGE_TIMEZONE", # IANA tz (e.g. "Europe/Warsaw"); empty = UTC
|
||||
# Logging — applied live via apply_log_levels() when changed.
|
||||
"log_level": "NOTIFY_BRIDGE_LOG_LEVEL", # DEBUG/INFO/WARNING/ERROR
|
||||
"log_format": "NOTIFY_BRIDGE_LOG_FORMAT", # text|json (requires restart to switch)
|
||||
"log_levels": "NOTIFY_BRIDGE_LOG_LEVELS", # module=LEVEL,module2=LEVEL
|
||||
}
|
||||
|
||||
_DEFAULTS = {
|
||||
@@ -35,12 +39,20 @@ _DEFAULTS = {
|
||||
"telegram_asset_cache_max_entries": "5000",
|
||||
"supported_locales": "en,ru",
|
||||
"timezone": "UTC",
|
||||
"log_level": "INFO",
|
||||
"log_format": "text",
|
||||
"log_levels": "",
|
||||
}
|
||||
|
||||
# Settings whose changes require dropping in-memory Telegram caches so the
|
||||
# next dispatch rebuilds them with the new parameters. Files are preserved.
|
||||
_CACHE_SETTING_KEYS = {"telegram_cache_ttl_hours", "telegram_asset_cache_max_entries"}
|
||||
|
||||
# Settings that change logging behaviour. ``log_level`` + ``log_levels`` apply
|
||||
# live via apply_log_levels(); ``log_format`` requires a restart because
|
||||
# changing it means swapping the handler formatter entirely.
|
||||
_LOG_SETTING_KEYS = {"log_level", "log_levels", "log_format"}
|
||||
|
||||
|
||||
async def get_setting(session: AsyncSession, key: str) -> str:
|
||||
"""Read a setting from DB, falling back to env var then default."""
|
||||
@@ -66,6 +78,9 @@ class SettingsUpdate(BaseModel):
|
||||
telegram_asset_cache_max_entries: int | str | None = None
|
||||
supported_locales: str | None = None
|
||||
timezone: str | None = None
|
||||
log_level: str | None = None
|
||||
log_format: str | None = None
|
||||
log_levels: str | None = None
|
||||
|
||||
|
||||
@router.get("")
|
||||
@@ -95,6 +110,7 @@ async def update_settings(
|
||||
old_secret = await get_setting(session, "telegram_webhook_secret")
|
||||
old_cache_values = {k: await get_setting(session, k) for k in _CACHE_SETTING_KEYS}
|
||||
old_timezone = await get_setting(session, "timezone")
|
||||
old_log_values = {k: await get_setting(session, k) for k in _LOG_SETTING_KEYS}
|
||||
|
||||
for key in _SETTING_KEYS:
|
||||
value = getattr(body, key, None)
|
||||
@@ -130,6 +146,25 @@ async def update_settings(
|
||||
new_base_url = await get_setting(session, "external_url")
|
||||
new_secret = await get_setting(session, "telegram_webhook_secret")
|
||||
new_timezone = await get_setting(session, "timezone")
|
||||
new_log_values = {k: await get_setting(session, k) for k in _LOG_SETTING_KEYS}
|
||||
|
||||
# Apply live log-level changes (log_format still needs a restart).
|
||||
if (new_log_values["log_level"] != old_log_values["log_level"]
|
||||
or new_log_values["log_levels"] != old_log_values["log_levels"]):
|
||||
from ..logging_setup import apply_log_levels
|
||||
apply_log_levels(
|
||||
level=new_log_values["log_level"] or None,
|
||||
per_module_levels=new_log_values["log_levels"],
|
||||
)
|
||||
_LOGGER.info(
|
||||
"Log levels updated: root=%s overrides=%r",
|
||||
new_log_values["log_level"], new_log_values["log_levels"],
|
||||
)
|
||||
if new_log_values["log_format"] != old_log_values["log_format"]:
|
||||
_LOGGER.warning(
|
||||
"log_format changed from %r to %r — restart the server for it to take effect",
|
||||
old_log_values["log_format"], new_log_values["log_format"],
|
||||
)
|
||||
|
||||
# Cron triggers freeze their timezone at construction time, so a tz change
|
||||
# has no effect until the jobs are rebuilt — do that here, before we
|
||||
@@ -199,7 +234,10 @@ async def _reregister_webhooks(
|
||||
if res.get("success"):
|
||||
_LOGGER.info("Re-registered webhook for bot %d (%s)", bot.id, bot.name)
|
||||
else:
|
||||
_LOGGER.warning(
|
||||
"Failed to re-register webhook for bot %d: %s",
|
||||
bot.id, res.get("error"),
|
||||
# Webhook re-register failure means the bot silently stops
|
||||
# delivering updates — this is operational visibility for an
|
||||
# admin, ERROR is appropriate.
|
||||
_LOGGER.error(
|
||||
"Failed to re-register webhook for bot %d (%s): %s",
|
||||
bot.id, bot.name, res.get("error"),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user